diff --git a/fluid/DeepASR/data_utils/async_data_reader.py b/fluid/DeepASR/data_utils/async_data_reader.py
index 1515b299d4357eac16892dceda6e4f05bf1fc045..731c55de71e8d4b7db156f1ae72172c36eb1be7a 100644
--- a/fluid/DeepASR/data_utils/async_data_reader.py
+++ b/fluid/DeepASR/data_utils/async_data_reader.py
@@ -15,9 +15,7 @@ from multiprocessing import Manager, Process
 import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
 import data_utils.augmentor.trans_add_delta as trans_add_delta
 from data_utils.util import suppress_complaints, suppress_signal
-from data_utils.util import SharedNDArray, SharedMemoryPoolManager
-from data_utils.util import DaemonProcessGroup, batch_to_ndarray
-from data_utils.util import CriticalException, ForceExitWrapper, EpochEndSignal
+from data_utils.util import CriticalException, ForceExitWrapper
 
 
 class SampleInfo(object):
@@ -32,11 +30,12 @@ class SampleInfo(object):
         label_bin_path (str): File containing the label data.
         label_size (int): Byte count of the sample's label data.
         label_frame_num (int): Label number of the sample.
+        sample_name (str): Key of the sample
     """
 
     def __init__(self, feature_bin_path, feature_start, feature_size,
                  feature_frame_num, feature_dim, label_bin_path, label_start,
-                 label_size, label_frame_num):
+                 label_size, label_frame_num, sample_name):
         self.feature_bin_path = feature_bin_path
         self.feature_start = feature_start
         self.feature_size = feature_size
@@ -47,6 +46,7 @@ class SampleInfo(object):
         self.label_start = label_start
         self.label_size = label_size
         self.label_frame_num = label_frame_num
+        self.sample_name = sample_name
 
 
 class SampleInfoBucket(object):
@@ -69,8 +69,8 @@ class SampleInfoBucket(object):
         split_sentence_threshold(int): Sentence whose length larger than
             the value will trigger split operation.
         split_sub_sentence_len(int): sub-sentence length is equal to
-            (split_sub_sentence_len + \
-                    rand() % split_perturb).
+            (split_sub_sentence_len +
+             rand() % split_perturb).
""" def __init__(self, @@ -104,24 +104,33 @@ class SampleInfoBucket(object): feature_bin_path = self._feature_bin_paths[block_idx] feature_desc_path = self._feature_desc_paths[block_idx] - label_desc_lines = open(label_desc_path).readlines() feature_desc_lines = open(feature_desc_path).readlines() - sample_num = int(label_desc_lines[0].split()[1]) - assert sample_num == int(feature_desc_lines[0].split()[1]) + label_desc_lines = [] + if label_desc_path != "": + label_desc_lines = open(label_desc_path).readlines() + sample_num = int(feature_desc_lines[0].split()[1]) + + if label_desc_path != "": + assert sample_num == int(label_desc_lines[0].split()[1]) for i in xrange(sample_num): feature_desc_split = feature_desc_lines[i + 1].split() + sample_name = feature_desc_split[0] feature_start = int(feature_desc_split[2]) feature_size = int(feature_desc_split[3]) feature_frame_num = int(feature_desc_split[4]) feature_dim = int(feature_desc_split[5]) - label_desc_split = label_desc_lines[i + 1].split() - label_start = int(label_desc_split[2]) - label_size = int(label_desc_split[3]) - label_frame_num = int(label_desc_split[4]) - assert feature_frame_num == label_frame_num + label_start = -1 + label_size = -1 + label_frame_num = feature_frame_num + if label_desc_path != "": + label_desc_split = label_desc_lines[i + 1].split() + label_start = int(label_desc_split[2]) + label_size = int(label_desc_split[3]) + label_frame_num = int(label_desc_split[4]) + assert feature_frame_num == label_frame_num if self._split_sentence_threshold == -1 or \ self._split_perturb == -1 or \ @@ -131,7 +140,7 @@ class SampleInfoBucket(object): SampleInfo(feature_bin_path, feature_start, feature_size, feature_frame_num, feature_dim, label_bin_path, label_start, label_size, - label_frame_num)) + label_frame_num, sample_name)) #split sentence else: cur_frame_pos = 0 @@ -152,16 +161,19 @@ class SampleInfoBucket(object): * feature_dim * 4, cur_frame_len * feature_dim * 4, cur_frame_len, feature_dim, label_bin_path, label_start + cur_frame_pos * 4, cur_frame_len * - 4, cur_frame_len)) + 4, cur_frame_len, sample_name)) remain_frame_num -= cur_frame_len cur_frame_pos += cur_frame_len if remain_frame_num <= 0: break - return sample_info_list +class EpochEndSignal(): + pass + + class AsyncDataReader(object): """DataReader provides basic audio sample preprocessing pipeline including data loading and data augmentation. 
@@ -190,7 +202,7 @@ class AsyncDataReader(object):
 
     def __init__(self,
                  feature_file_list,
-                 label_file_list,
+                 label_file_list="",
                  drop_frame_len=512,
                  proc_num=10,
                  sample_buffer_size=1024,
@@ -213,27 +225,30 @@ class AsyncDataReader(object):
         self._sample_info_buffer_size = sample_info_buffer_size
         self._batch_buffer_size = batch_buffer_size
         self._proc_num = proc_num
-        if self._proc_num <= 2:
-            raise ValueError("Value of `proc_num` should be greater than 2.")
-        self._sample_proc_num = self._proc_num - 2
         self._verbose = verbose
         self._force_exit = ForceExitWrapper(self._manager.Value('b', False))
-        self._pool_manager = SharedMemoryPoolManager(self._batch_buffer_size *
-                                                     3, self._manager)
 
     def generate_bucket_list(self, is_shuffle):
         if self._block_info_list is None:
             block_feature_info_lines = open(self._feature_file_list).readlines()
-            block_label_info_lines = open(self._label_file_list).readlines()
-            assert len(block_feature_info_lines) == len(block_label_info_lines)
             self._block_info_list = []
-            for i in xrange(0, len(block_feature_info_lines), 2):
-                block_info = (block_feature_info_lines[i],
-                              block_feature_info_lines[i + 1],
-                              block_label_info_lines[i],
-                              block_label_info_lines[i + 1])
-                self._block_info_list.append(
-                    map(lambda line: line.strip(), block_info))
+            if self._label_file_list != "":
+                block_label_info_lines = open(self._label_file_list).readlines()
+                assert len(block_feature_info_lines) == len(
+                    block_label_info_lines)
+                for i in xrange(0, len(block_feature_info_lines), 2):
+                    block_info = (block_feature_info_lines[i],
+                                  block_feature_info_lines[i + 1],
+                                  block_label_info_lines[i],
+                                  block_label_info_lines[i + 1])
+                    self._block_info_list.append(
+                        map(lambda line: line.strip(), block_info))
+            else:
+                for i in xrange(0, len(block_feature_info_lines), 2):
+                    block_info = (block_feature_info_lines[i],
+                                  block_feature_info_lines[i + 1], "", "")
+                    self._block_info_list.append(
+                        map(lambda line: line.strip(), block_info))
 
         if is_shuffle:
             self._rng.shuffle(self._block_info_list)
@@ -253,23 +268,13 @@ class AsyncDataReader(object):
     def set_transformers(self, transformers):
         self._transformers = transformers
 
-    def recycle(self, *args):
-        for shared_ndarray in args:
-            if not isinstance(shared_ndarray, SharedNDArray):
-                raise Value("Only support recycle SharedNDArray object.")
-            shared_ndarray.recycle(self._pool_manager.pool)
-
-    def _start_async_processing(self):
+    def _sample_generator(self):
         sample_info_queue = self._manager.Queue(self._sample_info_buffer_size)
         sample_queue = self._manager.Queue(self._sample_buffer_size)
         self._order_id = 0
 
         @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
         def ordered_feeding_task(sample_info_queue):
-            if self._verbose == 0:
-                signal.signal(signal.SIGTERM, suppress_signal)
-                signal.signal(signal.SIGINT, suppress_signal)
-
             for sample_info_bucket in self._bucket_list:
                 try:
                     sample_info_list = \
@@ -282,12 +287,13 @@ class AsyncDataReader(object):
                     sample_info_queue.put((sample_info, self._order_id))
                     self._order_id += 1
 
-            for i in xrange(self._sample_proc_num):
+            for i in xrange(self._proc_num):
                 sample_info_queue.put(EpochEndSignal())
 
-        feeding_proc = DaemonProcessGroup(
-            proc_num=1, target=ordered_feeding_task, args=(sample_info_queue, ))
-        feeding_proc.start_all()
+        feeding_thread = Thread(
+            target=ordered_feeding_task, args=(sample_info_queue, ))
+        feeding_thread.daemon = True
+        feeding_thread.start()
 
         @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
         def ordered_processing_task(sample_info_queue, sample_queue,
                                     out_order):
@@ -315,25 +321,32 @@ class AsyncDataReader(object):
                                           sample_info.feature_size)
 
                 assert sample_info.feature_frame_num \
-                    * sample_info.feature_dim * 4 == len(feature_bytes), \
-                    (sample_info.feature_bin_path,
-                     sample_info.feature_frame_num,
-                     sample_info.feature_dim,
-                     len(feature_bytes))
-
-                label_bytes = read_bytes(sample_info.label_bin_path,
-                                         sample_info.label_start,
-                                         sample_info.label_size)
-
-                assert sample_info.label_frame_num * 4 == len(label_bytes), (
-                    sample_info.label_bin_path, sample_info.label_array,
-                    len(label_bytes))
-
-                label_array = struct.unpack('I' * sample_info.label_frame_num,
-                                            label_bytes)
-                label_data = np.array(
-                    label_array, dtype='int64').reshape(
-                        (sample_info.label_frame_num, 1))
+                    * sample_info.feature_dim * 4 \
+                    == len(feature_bytes), \
+                    (sample_info.feature_bin_path,
+                     sample_info.feature_frame_num,
+                     sample_info.feature_dim,
+                     len(feature_bytes))
+
+                label_data = None
+                if sample_info.label_bin_path != "":
+                    label_bytes = read_bytes(sample_info.label_bin_path,
+                                             sample_info.label_start,
+                                             sample_info.label_size)
+
+                    assert sample_info.label_frame_num * 4 == len(
+                        label_bytes), (sample_info.label_bin_path,
+                                       sample_info.label_frame_num,
+                                       len(label_bytes))
+
+                    label_array = struct.unpack(
+                        'I' * sample_info.label_frame_num, label_bytes)
+                    label_data = np.array(
+                        label_array, dtype='int64').reshape(
+                            (sample_info.label_frame_num, 1))
+                else:
+                    label_data = np.zeros(
+                        (sample_info.label_frame_num, 1), dtype='int64')
 
                 feature_frame_num = sample_info.feature_frame_num
                 feature_dim = sample_info.feature_dim
@@ -343,12 +356,11 @@ class AsyncDataReader(object):
                 feature_data = np.array(
                     feature_array, dtype='float32').reshape((
                         sample_info.feature_frame_num, sample_info.feature_dim))
-
-                sample_data = (feature_data, label_data)
+                sample_data = (feature_data, label_data,
+                               sample_info.sample_name)
                 for transformer in self._transformers:
                     # @TODO(pkuyym) to make transformer only accept feature_data
                     sample_data = transformer.perform_trans(sample_data)
-
                 while order_id != out_order[0]:
                     time.sleep(0.001)
@@ -364,71 +376,77 @@ class AsyncDataReader(object):
 
         out_order = self._manager.list([0])
         args = (sample_info_queue, sample_queue, out_order)
-        sample_proc = DaemonProcessGroup(
-            proc_num=self._sample_proc_num,
-            target=ordered_processing_task,
-            args=args)
-        sample_proc.start_all()
+        workers = [
+            Process(
+                target=ordered_processing_task, args=args)
+            for _ in xrange(self._proc_num)
+        ]
 
-        return sample_queue
+        for w in workers:
+            w.daemon = True
+            w.start()
 
-    def batch_iterator(self, batch_size, minimum_batch_size):
-        @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
-        def batch_assembling_task(sample_queue, batch_queue, pool):
-            def conv_to_shared(ndarray):
-                while self._force_exit == False:
-                    try:
-                        (name, shared_ndarray) = pool.popitem()
-                    except Exception as e:
-                        time.sleep(0.001)
+        finished_proc_num = 0
+
+        while self._force_exit == False:
+            try:
+                sample = sample_queue.get_nowait()
+            except Queue.Empty:
+                time.sleep(0.001)
+            else:
+                if isinstance(sample, EpochEndSignal):
+                    finished_proc_num += 1
+                    if finished_proc_num >= self._proc_num:
+                        break
                 else:
-                    shared_ndarray.copy(ndarray)
-                    return shared_ndarray
+                    continue
 
-            if self._verbose == 0:
-                signal.signal(signal.SIGTERM, suppress_signal)
-                signal.signal(signal.SIGINT, suppress_signal)
+                yield sample
 
+    def batch_iterator(self, batch_size, minimum_batch_size):
+        def batch_to_ndarray(batch_samples, lod):
+            assert len(batch_samples)
+            frame_dim = batch_samples[0][0].shape[1]
+            batch_feature = np.zeros((lod[-1], frame_dim), dtype="float32")
+            batch_label = np.zeros((lod[-1], 1), dtype="int64")
+            start = 0
+            name_lst = []
+            for sample in batch_samples:
+                frame_num = sample[0].shape[0]
+                batch_feature[start:start + frame_num, :] = sample[0]
+                batch_label[start:start + frame_num, :] = sample[1]
+                start += frame_num
+                name_lst.append(sample[2])
+            return (batch_feature, batch_label, name_lst)
+
+        @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
+        def batch_assembling_task(sample_generator, batch_queue):
             batch_samples = []
             lod = [0]
-            done_num = 0
-            while done_num < self._sample_proc_num:
-                sample = sample_queue.get()
-                if isinstance(sample, EpochEndSignal):
-                    done_num += 1
-                else:
-                    batch_samples.append(sample)
-                    lod.append(lod[-1] + sample[0].shape[0])
-                    if len(batch_samples) == batch_size:
-                        feature, label = batch_to_ndarray(batch_samples, lod)
-
-                        feature = conv_to_shared(feature)
-                        label = conv_to_shared(label)
-                        lod = conv_to_shared(np.array(lod).astype('int64'))
-
-                        batch_queue.put((feature, label, lod))
-                        batch_samples = []
-                        lod = [0]
+            for sample in sample_generator():
+                batch_samples.append(sample)
+                lod.append(lod[-1] + sample[0].shape[0])
+                if len(batch_samples) == batch_size:
+                    (batch_feature, batch_label, name_lst) = batch_to_ndarray(
+                        batch_samples, lod)
+                    batch_queue.put((batch_feature, batch_label, lod, name_lst))
+                    batch_samples = []
+                    lod = [0]
 
             if len(batch_samples) >= minimum_batch_size:
-                (feature, label) = batch_to_ndarray(batch_samples, lod)
-
-                feature = conv_to_shared(feature)
-                label = conv_to_shared(label)
-                lod = conv_to_shared(np.array(lod).astype('int64'))
-
-                batch_queue.put((feature, label, lod))
+                (batch_feature, batch_label, name_lst) = batch_to_ndarray(
+                    batch_samples, lod)
+                batch_queue.put((batch_feature, batch_label, lod, name_lst))
 
             batch_queue.put(EpochEndSignal())
 
-        sample_queue = self._start_async_processing()
-        batch_queue = self._manager.Queue(self._batch_buffer_size)
+        batch_queue = Queue.Queue(self._batch_buffer_size)
 
-        assembling_proc = DaemonProcessGroup(
-            proc_num=1,
+        assembling_thread = Thread(
             target=batch_assembling_task,
-            args=(sample_queue, batch_queue, self._pool_manager.pool))
-        assembling_proc.start_all()
+            args=(self._sample_generator, batch_queue))
+        assembling_thread.daemon = True
+        assembling_thread.start()
 
         while self._force_exit == False:
             try:
diff --git a/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py b/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py
index 157ab02eee0093fe5d683e642b3d18d842cb4e19..9f76a9f8590d5f148398c4ffaff77dc95421df83 100644
--- a/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py
+++ b/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py
@@ -22,7 +22,7 @@ class TestTransMeanVarianceNorm(unittest.TestCase):
         feature = np.zeros((2, 120), dtype="float32")
         feature.fill(1)
         trans = trans_mean_variance_norm.TransMeanVarianceNorm(self._file_path)
-        (feature1, label1) = trans.perform_trans((feature, None))
+        (feature1, label1, name) = trans.perform_trans((feature, None, None))
         (mean, var) = trans.get_mean_var()
         feature_flat1 = feature1.flatten()
         feature_flat = feature.flatten()
@@ -70,7 +70,7 @@ class TestTransAddDelta(unittest.TestCase):
         feature[2, 0:40].fill(3)
         feature[3, 0:40].fill(4)
         trans = trans_add_delta.TransAddDelta()
-        (feature, label) = trans.perform_trans((feature, None))
+        (feature, label, name) = trans.perform_trans((feature, None, None))
         self.assertAlmostEqual(feature.shape[0], 4)
         self.assertAlmostEqual(feature.shape[1], 120)
         self.assertAlmostEqual(1.0, feature[0][0])
@@ -93,7 +93,7 @@ class TestTransSplict(unittest.TestCase):
             feature[i, :].fill(i)
 
         trans = trans_splice.TransSplice()
-        (feature, label) = trans.perform_trans((feature, None))
+        (feature, label, name) = trans.perform_trans((feature, None, None))
         self.assertEqual(feature.shape[1], 110)
 
         for i in xrange(8):
diff --git a/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py b/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py
index dc1a4fa45be38152eba773c35e67d0ad3e4a13cb..aa8062f87c932b76dd8a79db825d07e8be273857 100644
--- a/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py
+++ b/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py
@@ -32,9 +32,9 @@ class TransAddDelta(object):
         Args:
             sample(object,tuple): contain feature numpy and label numpy
         Returns:
-            (feature, label)
+            (feature, label, name)
         """
-        (feature, label) = sample
+        (feature, label, name) = sample
         frame_dim = feature.shape[1]
         d_frame_dim = frame_dim * 3
         head_filled = 5
@@ -64,7 +64,7 @@ class TransAddDelta(object):
                          start * d_frame_dim + 2 * frame_dim, frame_dim,
                          nframe, d_frame_dim)
         mat.shape = tmp_shape
-        return (mat[head_filled:mat.shape[0] - tail_filled, :], label)
+        return (mat[head_filled:mat.shape[0] - tail_filled, :], label, name)
 
     def _regress(self, data_in, start_in, data_out, start_out, size, n, step):
         """ regress
diff --git a/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py b/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py
index 5b541d426c61364639f7a9d9f50bd51a2c06efa5..9f91b726ea2bcd432340cd06a3cb9006cd5f83f4 100644
--- a/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py
+++ b/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py
@@ -53,9 +53,9 @@ class TransMeanVarianceNorm(object):
         Args:
             sample(object):input sample, contain feature numpy and label numpy
         Returns:
-            (feature, label)
+            (feature, label, name)
         """
-        (feature, label) = sample
+        (feature, label, name) = sample
         shape = feature.shape
         assert len(shape) == 2
         nfeature_len = shape[0] * shape[1]
@@ -68,4 +68,4 @@ class TransMeanVarianceNorm(object):
             feature[ncur_idx:ncur_idx + self._nLen] = block
             ncur_idx += self._nLen
         feature = feature.reshape(shape)
-        return (feature, label)
+        return (feature, label, name)
diff --git a/fluid/DeepASR/data_utils/augmentor/trans_splice.py b/fluid/DeepASR/data_utils/augmentor/trans_splice.py
index 94f5258de316045d41999b26c6963f8487e9c55a..1fab3d6b442c1613f18d16fd0b0ee89464dbeb2c 100644
--- a/fluid/DeepASR/data_utils/augmentor/trans_splice.py
+++ b/fluid/DeepASR/data_utils/augmentor/trans_splice.py
@@ -30,9 +30,9 @@ class TransSplice(object):
         Args:
             sample(object): input sample(feature, label)
         Return:
-            (feature, label)
+            (feature, label, name)
         """
-        (feature, label) = sample
+        (feature, label, name) = sample
         nframe_num = feature.shape[0]
         nframe_dim = feature.shape[1]
         nnew_frame_dim = nframe_dim * (
@@ -61,4 +61,4 @@ class TransSplice(object):
             np.copyto(ret[i * nnew_frame_dim:(i + 1) * nnew_frame_dim],
                       mat[i * nframe_dim:i * nframe_dim + nnew_frame_dim])
         ret = ret.reshape((nframe_num, nnew_frame_dim))
-        return (ret, label)
+        return (ret, label, name)
diff --git a/fluid/DeepASR/data_utils/util.py b/fluid/DeepASR/data_utils/util.py
index e8ccbadc0bf2106ccabd73a449eb5e53983ccf95..0a48f4696547377dbe89934355e8eaac38966fab 100644
--- a/fluid/DeepASR/data_utils/util.py
+++ b/fluid/DeepASR/data_utils/util.py
@@ -4,8 +4,6 @@ from __future__ import print_function
 import sys
 from six import reraise
 from tblib import Traceback
-from multiprocessing import Manager, Process
-import posix_ipc, mmap
 
 import numpy as np
 
@@ -37,19 +35,6 @@ def lodtensor_to_ndarray(lod_tensor):
     return ret, lod_tensor.lod()
 
 
-def batch_to_ndarray(batch_samples, lod):
-    frame_dim = batch_samples[0][0].shape[1]
-    batch_feature = np.zeros((lod[-1], frame_dim), dtype="float32")
-    batch_label = np.zeros((lod[-1], 1), dtype="int64")
-    start = 0
-    for sample in batch_samples:
-        frame_num = sample[0].shape[0]
-        batch_feature[start:start + frame_num, :] = sample[0]
-        batch_label[start:start + frame_num, :] = sample[1]
-        start += frame_num
-    return (batch_feature, batch_label)
-
-
 def split_infer_result(infer_seq, lod):
     infer_batch = []
     for i in xrange(0, len(lod[0]) - 1):
@@ -57,126 +42,10 @@ def split_infer_result(infer_seq, lod):
     return infer_batch
 
 
-class DaemonProcessGroup(object):
-    def __init__(self, proc_num, target, args):
-        self._proc_num = proc_num
-        self._workers = [
-            Process(
-                target=target, args=args) for _ in xrange(self._proc_num)
-        ]
-
-    def start_all(self):
-        for w in self._workers:
-            w.daemon = True
-            w.start()
-
-    @property
-    def proc_num(self):
-        return self._proc_num
-
-
-class EpochEndSignal(object):
-    pass
-
-
 class CriticalException(Exception):
     pass
 
 
-class SharedNDArray(object):
-    """SharedNDArray utilizes shared memory to avoid data serialization when
-    data object shared among different processes. We can reconstruct the
-    `ndarray` when memory address, shape and dtype provided.
-
-    Args:
-        name (str): Address name of shared memory.
-        whether_verify (bool): Whether to validate the writing operation.
-    """
-
-    def __init__(self, name, whether_verify=False):
-        self._name = name
-        self._shm = None
-        self._buf = None
-        self._array = np.zeros(1, dtype=np.float32)
-        self._inited = False
-        self._whether_verify = whether_verify
-
-    def zeros_like(self, shape, dtype):
-        size = int(np.prod(shape)) * np.dtype(dtype).itemsize
-        if self._inited:
-            self._shm = posix_ipc.SharedMemory(self._name)
-        else:
-            self._shm = posix_ipc.SharedMemory(
-                self._name, posix_ipc.O_CREAT, size=size)
-        self._buf = mmap.mmap(self._shm.fd, size)
-        self._array = np.ndarray(shape, dtype, self._buf, order='C')
-
-    def copy(self, ndarray):
-        size = int(np.prod(ndarray.shape)) * np.dtype(ndarray.dtype).itemsize
-        self.zeros_like(ndarray.shape, ndarray.dtype)
-        self._array[:] = ndarray
-        self._buf.flush()
-        self._inited = True
-
-        if self._whether_verify:
-            shm = posix_ipc.SharedMemory(self._name)
-            buf = mmap.mmap(shm.fd, size)
-            array = np.ndarray(ndarray.shape, ndarray.dtype, buf, order='C')
-            np.testing.assert_array_equal(array, ndarray)
-
-    @property
-    def ndarray(self):
-        return self._array
-
-    def recycle(self, pool):
-        self._buf.close()
-        self._shm.close_fd()
-        self._inited = False
-        pool[self._name] = self
-
-    def __getstate__(self):
-        return (self._name, self._array.shape, self._array.dtype, self._inited,
-                self._whether_verify)
-
-    def __setstate__(self, state):
-        self._name = state[0]
-        self._inited = state[3]
-        self.zeros_like(state[1], state[2])
-        self._whether_verify = state[4]
-
-
-class SharedMemoryPoolManager(object):
-    """SharedMemoryPoolManager maintains a multiprocessing.Manager.dict object.
-    All available addresses are allocated once and will be reused. Though this
-    class is not process-safe, the pool can be shared between processes. All
-    shared memory should be unlinked before the main process exited.
-
-    Args:
-        pool_size (int): Size of shared memory pool.
-        manager (dict): A multiprocessing.Manager object, the pool is
-            maintained by the proxy process.
-        name_prefix (str): Address prefix of shared memory.
-    """
-
-    def __init__(self, pool_size, manager, name_prefix='/deep_asr'):
-        self._names = []
-        self._dict = manager.dict()
-
-        for i in xrange(pool_size):
-            name = name_prefix + '_' + str(i)
-            self._dict[name] = SharedNDArray(name)
-            self._names.append(name)
-
-    @property
-    def pool(self):
-        return self._dict
-
-    def __del__(self):
-        for name in self._names:
-            # have to unlink the shared memory
-            posix_ipc.unlink_shared_memory(name)
-
-
 def suppress_signal(signo, stack_frame):
     pass
 
diff --git a/fluid/DeepASR/decoder/decoder.cc b/fluid/DeepASR/decoder/decoder.cc
deleted file mode 100644
index a99f972e2fc2341247eb2a6aa564d8d6b5e2905d..0000000000000000000000000000000000000000
--- a/fluid/DeepASR/decoder/decoder.cc
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "decoder.h"
-
-std::string decode(std::vector<std::vector<float>> probs_mat) {
-  // Add decoding logic here
-
-  return "example decoding result";
-}
diff --git a/fluid/DeepASR/decoder/decoder.h b/fluid/DeepASR/decoder/decoder.h
deleted file mode 100644
index 4a67fa366ae31dd393d0e3b2f04d27e360596fe5..0000000000000000000000000000000000000000
--- a/fluid/DeepASR/decoder/decoder.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-
-std::string decode(std::vector<std::vector<float>> probs_mat);
diff --git a/fluid/DeepASR/decoder/post_decode_faster.cc b/fluid/DeepASR/decoder/post_decode_faster.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ce2b45bc6cecec5466f3d20841e5b8ba38151a6c
--- /dev/null
+++ b/fluid/DeepASR/decoder/post_decode_faster.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "post_decode_faster.h"
+
+typedef kaldi::int32 int32;
+using fst::SymbolTable;
+using fst::VectorFst;
+using fst::StdArc;
+
+Decoder::Decoder(std::string word_syms_filename,
+                 std::string fst_in_filename,
+                 std::string logprior_rxfilename,
+                 kaldi::BaseFloat acoustic_scale) {
+  const char* usage =
+      "Decode, reading log-likelihoods (of transition-ids or whatever symbol "
+      "is on the graph) as matrices.";
+
+  kaldi::ParseOptions po(usage);
+  binary = true;
+  this->acoustic_scale = acoustic_scale;
+  allow_partial = true;
+  kaldi::FasterDecoderOptions decoder_opts;
+  decoder_opts.Register(&po, true);  // true == include obscure settings.
+  po.Register("binary", &binary, "Write output in binary mode");
+  po.Register("allow-partial",
+              &allow_partial,
+              "Produce output even when final state was not reached");
+  po.Register("acoustic-scale",
+              &acoustic_scale,
+              "Scaling factor for acoustic likelihoods");
+
+  word_syms = NULL;
+  if (word_syms_filename != "") {
+    word_syms = fst::SymbolTable::ReadText(word_syms_filename);
+    if (!word_syms)
+      KALDI_ERR << "Could not read symbol table from file "
+                << word_syms_filename;
+  }
+
+  std::ifstream is_logprior(logprior_rxfilename);
+  logprior.Read(is_logprior, false);
+
+  // It's important that we initialize decode_fst after loglikes_reader, as it
+  // can prevent crashes on systems installed without enough virtual memory.
+  // It has to do with what happens on UNIX systems if you call fork() on a
+  // large process: the page-table entries are duplicated, which requires a
+  // lot of virtual memory.
+  decode_fst = fst::ReadFstKaldi(fst_in_filename);
+
+  decoder = new kaldi::FasterDecoder(*decode_fst, decoder_opts);
+}
+
+
+Decoder::~Decoder() {
+  if (word_syms) delete word_syms;
+  delete decode_fst;
+  delete decoder;
+}
+
+std::string Decoder::decode(
+    std::string key,
+    const std::vector<std::vector<kaldi::BaseFloat>>& log_probs) {
+  size_t num_frames = log_probs.size();
+  size_t dim_label = log_probs[0].size();
+
+  kaldi::Matrix<kaldi::BaseFloat> loglikes(
+      num_frames, dim_label, kaldi::kSetZero, kaldi::kStrideEqualNumCols);
+  for (size_t i = 0; i < num_frames; ++i) {
+    memcpy(loglikes.Data() + i * dim_label,
+           log_probs[i].data(),
+           sizeof(kaldi::BaseFloat) * dim_label);
+  }
+
+  return decode(key, loglikes);
+}
+
+
+std::vector<std::string> Decoder::decode(std::string posterior_rspecifier) {
+  kaldi::SequentialBaseFloatMatrixReader posterior_reader(posterior_rspecifier);
+  std::vector<std::string> decoding_results;
+
+  for (; !posterior_reader.Done(); posterior_reader.Next()) {
+    std::string key = posterior_reader.Key();
+    kaldi::Matrix<kaldi::BaseFloat> loglikes(posterior_reader.Value());
+
+    decoding_results.push_back(decode(key, loglikes));
+  }
+
+  return decoding_results;
+}
+
+
+std::string Decoder::decode(std::string key,
+                            kaldi::Matrix<kaldi::BaseFloat>& loglikes) {
+  std::string decoding_result;
+
+  if (loglikes.NumRows() == 0) {
+    KALDI_WARN << "Zero-length utterance: " << key;
+  }
+  KALDI_ASSERT(loglikes.NumCols() == logprior.Dim());
+
+  loglikes.ApplyLog();
+  loglikes.AddVecToRows(-1.0, logprior);
+
+  kaldi::DecodableMatrixScaled decodable(loglikes, acoustic_scale);
+  decoder->Decode(&decodable);
+
+  VectorFst<kaldi::LatticeArc> decoded;  // linear FST.
+
+  if ((allow_partial || decoder->ReachedFinal()) &&
+      decoder->GetBestPath(&decoded)) {
+    if (!decoder->ReachedFinal())
+      KALDI_WARN << "Decoder did not reach end-state, outputting partial "
+                    "traceback.";
+
+    std::vector<int32> alignment;
+    std::vector<int32> words;
+    kaldi::LatticeWeight weight;
+
+    GetLinearSymbolSequence(decoded, &alignment, &words, &weight);
+
+    if (word_syms != NULL) {
+      for (size_t i = 0; i < words.size(); i++) {
+        std::string s = word_syms->Find(words[i]);
+        decoding_result += s;
+        if (s == "")
+          KALDI_ERR << "Word-id " << words[i] << " not in symbol table.";
+      }
+    }
+  }
+
+  return decoding_result;
+}
diff --git a/fluid/DeepASR/decoder/post_decode_faster.h b/fluid/DeepASR/decoder/post_decode_faster.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bade8d6988f02ef4caab8ecf6fc50209aa3642a
--- /dev/null
+++ b/fluid/DeepASR/decoder/post_decode_faster.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+#include "base/kaldi-common.h"
+#include "base/timer.h"
+#include "decoder/decodable-matrix.h"
+#include "decoder/faster-decoder.h"
+#include "fstext/fstext-lib.h"
+#include "hmm/transition-model.h"
+#include "lat/kaldi-lattice.h"  // for {Compact}LatticeArc
+#include "tree/context-dep.h"
+#include "util/common-utils.h"
+
+
+class Decoder {
+public:
+  Decoder(std::string word_syms_filename,
+          std::string fst_in_filename,
+          std::string logprior_rxfilename,
+          kaldi::BaseFloat acoustic_scale);
+  ~Decoder();
+
+  // Interface to accept the scores read from specifier and return
+  // the batch decoding results
+  std::vector<std::string> decode(std::string posterior_rspecifier);
+
+  // Accept the scores of one utterance and return the decoding result
+  std::string decode(
+      std::string key,
+      const std::vector<std::vector<kaldi::BaseFloat>> &log_probs);
+
+private:
+  // For decoding one utterance
+  std::string decode(std::string key,
+                     kaldi::Matrix<kaldi::BaseFloat> &loglikes);
+
+  fst::SymbolTable *word_syms;
+  fst::VectorFst<fst::StdArc> *decode_fst;
+  kaldi::FasterDecoder *decoder;
+  kaldi::Vector<kaldi::BaseFloat> logprior;
+
+  bool binary;
+  kaldi::BaseFloat acoustic_scale;
+  bool allow_partial;
+};
diff --git a/fluid/DeepASR/decoder/pybind.cc b/fluid/DeepASR/decoder/pybind.cc
index 8cd65903eae63268d9f4412bd737162c639d8910..90ea38ffb535677dc66d74fc64ff3fe4a27bf824 100644
--- a/fluid/DeepASR/decoder/pybind.cc
+++ b/fluid/DeepASR/decoder/pybind.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,15 +15,25 @@
 limitations under the License. */
 
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
-#include "decoder.h"
+#include "post_decode_faster.h"
 
 namespace py = pybind11;
 
-PYBIND11_MODULE(decoder, m) {
-  m.doc() = "Decode function for Deep ASR model";
-
-  m.def("decode",
-        &decode,
-        "Decode one input probability matrix "
-        "and return the transcription");
+PYBIND11_MODULE(post_decode_faster, m) {
+  m.doc() = "Decoder for Deep ASR model";
+
+  py::class_<Decoder>(m, "Decoder")
+      .def(py::init<std::string, std::string, std::string, kaldi::BaseFloat>())
+      .def("decode",
+           (std::vector<std::string> (Decoder::*)(std::string)) &
+               Decoder::decode,
+           "Decode for the probability matrices in specifier "
+           "and return the transcriptions.")
+      .def(
+          "decode",
+          (std::string (Decoder::*)(
+              std::string,
+              const std::vector<std::vector<kaldi::BaseFloat>>&)) &
+              Decoder::decode,
+          "Decode one input probability matrix "
+          "and return the transcription.");
 }
diff --git a/fluid/DeepASR/decoder/setup.py b/fluid/DeepASR/decoder/setup.py
index cedd5d644e0dc1ca8855ab7e75ee1b7a30f8fcb1..a98c0b4cc17717a6769b8322e4f5afe3de6ab2de 100644
--- a/fluid/DeepASR/decoder/setup.py
+++ b/fluid/DeepASR/decoder/setup.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,27 +13,57 @@
 # limitations under the License.
 
 import os
+import glob
 from distutils.core import setup, Extension
 from distutils.sysconfig import get_config_vars
 
-args = ['-std=c++11']
+try:
+    kaldi_root = os.environ['KALDI_ROOT']
+except:
+    raise ValueError("Environment variable 'KALDI_ROOT' is not defined. Please "
+                     "install kaldi and export KALDI_ROOT=<kaldi's root dir>.")
+
+args = [
+    '-std=c++11', '-Wno-sign-compare', '-Wno-unused-variable',
+    '-Wno-unused-local-typedefs', '-Wno-unused-but-set-variable',
+    '-Wno-deprecated-declarations', '-Wno-unused-function'
+]
 
 # remove warning about -Wstrict-prototypes
 (opt, ) = get_config_vars('OPT')
 os.environ['OPT'] = " ".join(flag for flag in opt.split()
                              if flag != '-Wstrict-prototypes')
+os.environ['CC'] = 'g++'
+
+LIBS = [
+    'fst', 'kaldi-base', 'kaldi-util', 'kaldi-matrix', 'kaldi-tree',
+    'kaldi-hmm', 'kaldi-fstext', 'kaldi-decoder', 'kaldi-lat'
+]
+
+LIB_DIRS = [
+    'tools/openfst/lib', 'src/base', 'src/matrix', 'src/util', 'src/tree',
+    'src/hmm', 'src/fstext', 'src/decoder', 'src/lat'
+]
+LIB_DIRS = [os.path.join(kaldi_root, path) for path in LIB_DIRS]
+LIB_DIRS = [os.path.abspath(path) for path in LIB_DIRS]
 
 ext_modules = [
     Extension(
-        'decoder',
-        ['pybind.cc', 'decoder.cc'],
-        include_dirs=['pybind11/include', '.'],
+        'post_decode_faster',
+        ['pybind.cc', 'post_decode_faster.cc'],
+        include_dirs=[
+            'pybind11/include', '.', os.path.join(kaldi_root, 'src'),
+            os.path.join(kaldi_root, 'tools/openfst/src/include')
+        ],
         language='c++',
+        libraries=LIBS,
+        library_dirs=LIB_DIRS,
+        runtime_library_dirs=LIB_DIRS,
         extra_compile_args=args, ),
 ]
 
 setup(
-    name='decoder',
+    name='post_decode_faster',
     version='0.0.1',
     author='Paddle',
    author_email='',
diff --git a/fluid/DeepASR/decoder/setup.sh b/fluid/DeepASR/decoder/setup.sh
index 71fd6626efe1b7cf72a1e678ab7b74000ebfb8c3..1471f85f414ae8dd5230f04cf08da282adc3b0b7 100644
--- a/fluid/DeepASR/decoder/setup.sh
+++ b/fluid/DeepASR/decoder/setup.sh
@@ -1,4 +1,4 @@
-
+set -e
 if [ ! -d pybind11 ]; then
     git clone https://github.com/pybind/pybind11.git
 
diff --git a/fluid/DeepASR/infer.py b/fluid/DeepASR/infer.py
index babcb416ea884081ae249a8d1dc177f85cf1c9ba..84269261a95c381a9be21425abf43b98006f0886 100644
--- a/fluid/DeepASR/infer.py
+++ b/fluid/DeepASR/infer.py
@@ -8,7 +8,7 @@ import paddle.fluid as fluid
 import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
 import data_utils.augmentor.trans_add_delta as trans_add_delta
 import data_utils.augmentor.trans_splice as trans_splice
-import data_utils.data_reader as reader
+import data_utils.async_data_reader as reader
 from data_utils.util import lodtensor_to_ndarray
 from data_utils.util import split_infer_result
 
@@ -79,12 +79,13 @@ def infer(args):
         trans_splice.TransSplice()
     ]
 
-    infer_data_reader = reader.DataReader(args.infer_feature_lst,
-                                          args.infer_label_lst)
+    infer_data_reader = reader.AsyncDataReader(args.infer_feature_lst,
+                                               args.infer_label_lst)
     infer_data_reader.set_transformers(ltrans)
 
     feature_t = fluid.LoDTensor()
     one_batch = infer_data_reader.batch_iterator(args.batch_size, 1).next()
+
     (features, labels, lod) = one_batch
     feature_t.set(features, place)
     feature_t.set_lod([lod])
diff --git a/fluid/DeepASR/infer_by_ckpt.py b/fluid/DeepASR/infer_by_ckpt.py
index 68dd573647d498704fd22f70a7df2255e7ac66cd..bf6093acb8e14ec926d1aefb759207905e468f8d 100644
--- a/fluid/DeepASR/infer_by_ckpt.py
+++ b/fluid/DeepASR/infer_by_ckpt.py
@@ -13,7 +13,7 @@ import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
 import data_utils.augmentor.trans_add_delta as trans_add_delta
 import data_utils.augmentor.trans_splice as trans_splice
 import data_utils.async_data_reader as reader
-import decoder.decoder as decoder
+from decoder.post_decode_faster import Decoder
 from data_utils.util import lodtensor_to_ndarray
 from model_utils.model import stacked_lstmp_model
 from data_utils.util import split_infer_result
@@ -32,6 +32,11 @@ def parse_args():
         default=1,
         help='The minimum sequence number of a batch data. '
         '(default: %(default)d)')
+    parser.add_argument(
+        '--frame_dim',
+        type=int,
+        default=120 * 11,
+        help='Frame dimension of feature data. (default: %(default)d)')
     parser.add_argument(
         '--stacked_num',
         type=int,
@@ -47,6 +52,11 @@ def parse_args():
         type=int,
         default=1024,
         help='Hidden size of lstmp unit. (default: %(default)d)')
+    parser.add_argument(
+        '--class_num',
+        type=int,
+        default=1749,
+        help='Number of classes in label. (default: %(default)d)')
     parser.add_argument(
         '--learning_rate',
         type=float,
@@ -81,6 +91,26 @@ def parse_args():
         type=str,
         default='./checkpoint',
         help="The checkpoint path to init model. (default: %(default)s)")
+    parser.add_argument(
+        '--vocabulary',
+        type=str,
+        default='./decoder/graph/words.txt',
+        help="The path to vocabulary. (default: %(default)s)")
+    parser.add_argument(
+        '--graphs',
+        type=str,
+        default='./decoder/graph/TLG.fst',
+        help="The path to TLG graphs for decoding. (default: %(default)s)")
+    parser.add_argument(
+        '--log_prior',
+        type=str,
+        default="./decoder/logprior",
+        help="The log prior probs for training data. (default: %(default)s)")
+    parser.add_argument(
+        '--acoustic_scale',
+        type=float,
+        default=0.2,
(default: %(default)f)") args = parser.parse_args() return args @@ -99,10 +129,11 @@ def infer_from_ckpt(args): raise IOError("Invalid checkpoint!") prediction, avg_cost, accuracy = stacked_lstmp_model( + frame_dim=args.frame_dim, hidden_dim=args.hidden_dim, proj_dim=args.proj_dim, stacked_num=args.stacked_num, - class_num=1749, + class_num=args.class_num, parallel=args.parallel) infer_program = fluid.default_main_program().clone() @@ -117,6 +148,10 @@ def infer_from_ckpt(args): # load checkpoint. fluid.io.load_persistables(exe, args.checkpoint) + # init decoder + decoder = Decoder(args.vocabulary, args.graphs, args.log_prior, + args.acoustic_scale) + ltrans = [ trans_add_delta.TransAddDelta(2, 2), trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var), @@ -136,12 +171,10 @@ def infer_from_ckpt(args): args.minimum_batch_size)): # load_data (features, labels, lod) = batch_data - feature_t.set(features.ndarray, place) - feature_t.set_lod([lod.ndarray]) - label_t.set(labels.ndarray, place) - label_t.set_lod([lod.ndarray]) - - infer_data_reader.recycle(features, labels, lod) + feature_t.set(features, place) + feature_t.set_lod([lod]) + label_t.set(labels, place) + label_t.set_lod([lod]) results = exe.run(infer_program, feed={"feature": feature_t, @@ -154,8 +187,8 @@ def infer_from_ckpt(args): probs, lod = lodtensor_to_ndarray(results[0]) infer_batch = split_infer_result(probs, lod) for index, sample in enumerate(infer_batch): - print("Decoding %d: " % (batch_id * args.batch_size + index), - decoder.decode(sample)) + key = "utter#%d" % (batch_id * args.batch_size + index) + print(key, ": ", decoder.decode(key, sample).encode("utf8"), "\n") print(np.mean(infer_costs), np.mean(infer_accs)) diff --git a/fluid/DeepASR/model_utils/model.py b/fluid/DeepASR/model_utils/model.py index 541f869c7224e620c519c97472dbe79ca73bd84b..8fb7596e122447979cf392d6610ad2b7281d195b 100644 --- a/fluid/DeepASR/model_utils/model.py +++ b/fluid/DeepASR/model_utils/model.py @@ -6,7 +6,8 @@ import paddle.v2 as paddle import paddle.fluid as fluid -def stacked_lstmp_model(hidden_dim, +def stacked_lstmp_model(frame_dim, + hidden_dim, proj_dim, stacked_num, class_num, @@ -20,12 +21,13 @@ def stacked_lstmp_model(hidden_dim, label data respectively. And in inference, only `feature` is needed. Args: - hidden_dim(int): The hidden state's dimension of the LSTMP layer. - proj_dim(int): The projection size of the LSTMP layer. - stacked_num(int): The number of stacked LSTMP layers. - parallel(bool): Run in parallel or not, default `False`. - is_train(bool): Run in training phase or not, default `True`. - class_dim(int): The number of output classes. + frame_dim(int): The frame dimension of feature data. + hidden_dim(int): The hidden state's dimension of the LSTMP layer. + proj_dim(int): The projection size of the LSTMP layer. + stacked_num(int): The number of stacked LSTMP layers. + parallel(bool): Run in parallel or not, default `False`. + is_train(bool): Run in training phase or not, default `True`. + class_dim(int): The number of output classes. 
""" # network configuration @@ -78,7 +80,7 @@ def stacked_lstmp_model(hidden_dim, # data feeder feature = fluid.layers.data( - name="feature", shape=[-1, 120 * 11], dtype="float32", lod_level=1) + name="feature", shape=[-1, frame_dim], dtype="float32", lod_level=1) label = fluid.layers.data( name="label", shape=[-1, 1], dtype="int64", lod_level=1) @@ -92,11 +94,12 @@ def stacked_lstmp_model(hidden_dim, feat_ = pd.read_input(feature) label_ = pd.read_input(label) prediction, avg_cost, acc = _net_conf(feat_, label_) - for out in [avg_cost, acc]: + for out in [prediction, avg_cost, acc]: pd.write_output(out) # get mean loss and acc through every devices. - avg_cost, acc = pd() + prediction, avg_cost, acc = pd() + prediction.stop_gradient = True avg_cost = fluid.layers.mean(x=avg_cost) acc = fluid.layers.mean(x=acc) else: diff --git a/fluid/DeepASR/tools/profile.py b/fluid/DeepASR/tools/profile.py index 77dff3cb371659ce672e10735174b846827c9d6b..69aee88e22d33ed80212692bf61e41e1666bf5e5 100644 --- a/fluid/DeepASR/tools/profile.py +++ b/fluid/DeepASR/tools/profile.py @@ -31,6 +31,11 @@ def parse_args(): default=1, help='The minimum sequence number of a batch data. ' '(default: %(default)d)') + parser.add_argument( + '--frame_dim', + type=int, + default=120 * 11, + help='Frame dimension of feature data. (default: %(default)d)') parser.add_argument( '--stacked_num', type=int, @@ -46,6 +51,11 @@ def parse_args(): type=int, default=1024, help='Hidden size of lstmp unit. (default: %(default)d)') + parser.add_argument( + '--class_num', + type=int, + default=1749, + help='Number of classes in label. (default: %(default)d)') parser.add_argument( '--learning_rate', type=float, @@ -119,10 +129,11 @@ def profile(args): "arg 'first_batches_to_skip' must not be smaller than 0.") _, avg_cost, accuracy = stacked_lstmp_model( + frame_dim=args.frame_dim, hidden_dim=args.hidden_dim, proj_dim=args.proj_dim, stacked_num=args.stacked_num, - class_num=1749, + class_num=args.class_num, parallel=args.parallel) optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) @@ -158,14 +169,12 @@ def profile(args): frames_seen = 0 # load_data (features, labels, lod) = batch_data - feature_t.set(features.ndarray, place) - feature_t.set_lod([lod.ndarray]) - label_t.set(labels.ndarray, place) - label_t.set_lod([lod.ndarray]) - - frames_seen += lod.ndarray[-1] + feature_t.set(features, place) + feature_t.set_lod([lod]) + label_t.set(labels, place) + label_t.set_lod([lod]) - data_reader.recycle(features, labels, lod) + frames_seen += lod[-1] outs = exe.run(fluid.default_main_program(), feed={"feature": feature_t, diff --git a/fluid/DeepASR/train.py b/fluid/DeepASR/train.py index 8297ab7403775125b6da372c4d916e5e3111b669..3908a550cdcf095057ea6ab0b89e07dcecda51f9 100644 --- a/fluid/DeepASR/train.py +++ b/fluid/DeepASR/train.py @@ -30,6 +30,11 @@ def parse_args(): default=1, help='The minimum sequence number of a batch data. ' '(default: %(default)d)') + parser.add_argument( + '--frame_dim', + type=int, + default=120 * 11, + help='Frame dimension of feature data. (default: %(default)d)') parser.add_argument( '--stacked_num', type=int, @@ -45,6 +50,11 @@ def parse_args(): type=int, default=1024, help='Hidden size of lstmp unit. (default: %(default)d)') + parser.add_argument( + '--class_num', + type=int, + default=1749, + help='Number of classes in label. 
+        help='Number of classes in label. (default: %(default)d)')
     parser.add_argument(
         '--pass_num',
         type=int,
@@ -137,10 +147,11 @@ def train(args):
         os.mkdir(args.infer_models)
 
     prediction, avg_cost, accuracy = stacked_lstmp_model(
+        frame_dim=args.frame_dim,
         hidden_dim=args.hidden_dim,
         proj_dim=args.proj_dim,
         stacked_num=args.stacked_num,
-        class_num=1749,
+        class_num=args.class_num,
         parallel=args.parallel)
 
     # program for test
@@ -182,12 +193,10 @@ def train(args):
                                         args.minimum_batch_size)):
             # load_data
             (features, labels, lod) = batch_data
-            feature_t.set(features.ndarray, place)
-            feature_t.set_lod([lod.ndarray])
-            label_t.set(labels.ndarray, place)
-            label_t.set_lod([lod.ndarray])
-
-            test_data_reader.recycle(features, labels, lod)
+            feature_t.set(features, place)
+            feature_t.set_lod([lod])
+            label_t.set(labels, place)
+            label_t.set_lod([lod])
 
             cost, acc = exe.run(test_program,
                                 feed={"feature": feature_t,
@@ -201,6 +210,7 @@ def train(args):
     # train data reader
     train_data_reader = reader.AsyncDataReader(args.train_feature_lst,
                                                args.train_label_lst, -1)
+
     train_data_reader.set_transformers(ltrans)
     # train
     for pass_id in xrange(args.pass_num):
@@ -209,13 +219,11 @@ def train(args):
                 train_data_reader.batch_iterator(args.batch_size,
                                                  args.minimum_batch_size)):
             # load_data
-            (features, labels, lod) = batch_data
-            feature_t.set(features.ndarray, place)
-            feature_t.set_lod([lod.ndarray])
-            label_t.set(labels.ndarray, place)
-            label_t.set_lod([lod.ndarray])
-
-            train_data_reader.recycle(features, labels, lod)
+            (features, labels, lod, name_lst) = batch_data
+            feature_t.set(features, place)
+            feature_t.set_lod([lod])
+            label_t.set(labels, place)
+            label_t.set_lod([lod])
 
             to_print = batch_id > 0 and (batch_id % args.print_per_batches == 0)
             outs = exe.run(fluid.default_main_program(),
diff --git a/fluid/image_classification/caffe2fluid/README.md b/fluid/image_classification/caffe2fluid/README.md
index 279b4c6e57a785736a1c75928de8d45f4e4e956e..5f565afe0c33db291092faeac632da3d51f95613 100644
--- a/fluid/image_classification/caffe2fluid/README.md
+++ b/fluid/image_classification/caffe2fluid/README.md
@@ -2,7 +2,8 @@ This tool is used to convert a Caffe model to Fluid model
 
 ### Howto
-1, Prepare caffepb.py in ./proto, two options provided
+1, Prepare caffepb.py in ./proto if your Python has no 'pycaffe' module; two options are provided here:
+
     1) generate it from caffe.proto using protoc
         bash ./proto/compile.sh
 
@@ -12,14 +13,24 @@ This tool is used to convert a Caffe model to Fluid model
 2, Convert the caffe model using 'convert.py' which will generate a python script and a weight (in .npy) file
 
 3, Use the converted model to predict
-    see more detail info in 'tests/lenet/README.md'
+
+    see more detailed info in 'examples/xxx'
+
 
-### Supported models
+### Tested models
 - Lenet on mnist dataset
 
 - ResNets: (ResNet-50, ResNet-101, ResNet-152)
-  model addrs: (https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777)
+  model addr: https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777
+
+- GoogleNet:
+  model addr: https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034
+
+- VGG:
+  model addr: https://gist.github.com/ksimonyan/211839e770f7b538e2d8
+
+- AlexNet:
+  model addr: https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet
 
 ### Notes
 Some of this code comes from here: https://github.com/ethereon/caffe-tensorflow
diff --git a/fluid/image_classification/caffe2fluid/convert.py b/fluid/image_classification/caffe2fluid/convert.py
index 68a9e4f7e490a69c1b582d6fc14b2015bfdf9536..379f1a26368c9ffa4a9f82dad499ad7114f942fc 100755
--- a/fluid/image_classification/caffe2fluid/convert.py
+++ b/fluid/image_classification/caffe2fluid/convert.py
@@ -4,8 +4,8 @@ import os
 import sys
 import numpy as np
 import argparse
-from kaffe import KaffeError, print_stderr
 
+from kaffe import KaffeError, print_stderr
 from kaffe.paddle import Transformer
 
 
@@ -47,6 +47,8 @@ def convert(def_path, caffemodel_path, data_output_path, code_output_path,
     except KaffeError as err:
         fatal_error('Error encountered: {}'.format(err))
 
+    return 0
+
 
 def main():
     """ main
@@ -64,9 +66,10 @@ def main():
         help='The phase to convert: test (default) or train')
     args = parser.parse_args()
     validate_arguments(args)
-    convert(args.def_path, args.caffemodel, args.data_output_path,
-            args.code_output_path, args.phase)
+    return convert(args.def_path, args.caffemodel, args.data_output_path,
+                   args.code_output_path, args.phase)
 
 
 if __name__ == '__main__':
-    main()
+    ret = main()
+    sys.exit(ret)
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/README.md b/fluid/image_classification/caffe2fluid/examples/imagenet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b82050859239be8804ddec8e2054edc38c4ac052
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/README.md
@@ -0,0 +1,10 @@
+a demo to show converting caffe models on 'imagenet' using caffe2fluid
+
+---
+
+# How to use
+
+1. prepare python environment
+2. download caffe model to "models.caffe/xxx" which contains "xxx.caffemodel" and "xxx.prototxt"
+3. run the tool
+    eg: bash ./run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/data/65.jpeg b/fluid/image_classification/caffe2fluid/examples/imagenet/data/65.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..fd3a93f59385d6ff632483646e6caee300b56d09
Binary files /dev/null and b/fluid/image_classification/caffe2fluid/examples/imagenet/data/65.jpeg differ
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec594199be5a3e7a33c9673b1d5497c95f20d946
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py
@@ -0,0 +1,142 @@
+#!/bin/env python
+
+#function:
+#   a demo to show how to use the converted model generated by caffe2fluid
+#
+#notes:
+#   only support imagenet data
+
+import os
+import sys
+import inspect
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def load_data(imgfile, shape):
+    h, w = shape[1:]
+    from PIL import Image
+    im = Image.open(imgfile)
+
+    # The storage order of the loaded image is W(width),
+    # H(height), C(channel). PaddlePaddle requires
+    # the CHW order, so transpose them.
+    im = im.resize((w, h), Image.ANTIALIAS)
+    im = np.array(im).astype(np.float32)
+    im = im.transpose((2, 0, 1))  # CHW
+    im = im[(2, 1, 0), :, :]  # BGR
+
+    # The mean to be subtracted from each image.
+    # By default, the per-channel ImageNet mean.
+    mean = np.array([104., 117., 124.], dtype=np.float32)
+    mean = mean.reshape([3, 1, 1])
+    im = im - mean
+    return im.reshape([1] + shape)
+
+
+def build_model(net_file, net_name):
+    print('build model with net_file[%s] and net_name[%s]' %
+          (net_file, net_name))
+
+    net_path = os.path.dirname(net_file)
+    module_name = os.path.splitext(os.path.basename(net_file))[0]
+    if net_path not in sys.path:
+        sys.path.insert(0, net_path)
+
+    try:
+        m = __import__(module_name, fromlist=[net_name])
+        MyNet = getattr(m, net_name)
+    except Exception as e:
+        print('failed to load module[%s]' % (module_name))
+        print(e)
+        return None
+
+    input_name = 'data'
+    input_shape = MyNet.input_shapes()[input_name]
+    images = fluid.layers.data(name='image', shape=input_shape, dtype='float32')
+    #label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    net = MyNet({input_name: images})
+    input_shape = MyNet.input_shapes()[input_name]
+    return net, input_shape
+
+
+def dump_results(results, names, root):
+    if os.path.exists(root) is False:
+        os.mkdir(root)
+
+    for i in range(len(names)):
+        n = names[i]
+        res = results[i]
+        filename = os.path.join(root, n)
+        np.save(filename + '.npy', res)
+
+
+def infer(net_file, net_name, model_file, imgfile, debug=False):
+    """ do inference using a model which consist 'xxx.py' and 'xxx.npy'
+    """
+    #1, build model
+    net, input_shape = build_model(net_file, net_name)
+    prediction = net.get_output()
+
+    #2, load weights for this model
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    startup_program = fluid.default_startup_program()
+    exe.run(startup_program)
+
+    if model_file.find('.npy') > 0:
+        net.load(data_path=model_file, exe=exe, place=place)
+    else:
+        net.load(data_path=model_file, exe=exe)
+
+    #3, test this model
+    test_program = fluid.default_main_program().clone()
+
+    fetch_list_var = []
+    fetch_list_name = []
+    if debug is False:
+        fetch_list_var.append(prediction)
+    else:
+        for k, v in net.layers.items():
+            fetch_list_var.append(v)
+            fetch_list_name.append(k)
+
+    np_images = load_data(imgfile, input_shape)
+    results = exe.run(program=test_program,
+                      feed={'image': np_images},
+                      fetch_list=fetch_list_var)
+
+    if debug is True:
+        dump_path = 'results.layers'
+        dump_results(results, fetch_list_name, dump_path)
+        print('all results dumped to [%s]' % (dump_path))
+    else:
+        result = results[0]
+        print('predicted class:', np.argmax(result))
+
+
+if __name__ == "__main__":
+    """ maybe more convenient to use 'run.sh' to call this tool
+    """
+    net_file = 'models/resnet50/resnet50.py'
+    weight_file = 'models/resnet50/resnet50.npy'
+    imgfile = 'data/65.jpeg'
+    net_name = 'ResNet50'
+
+    argc = len(sys.argv)
+    if argc == 5:
+        net_file = sys.argv[1]
+        weight_file = sys.argv[2]
+        imgfile = sys.argv[3]
+        net_name = sys.argv[4]
+    elif argc > 1:
+        print('usage:')
+        print('\tpython %s [net_file] [weight_file] [imgfile] [net_name]' %
+              (sys.argv[0]))
+        print('\teg:python %s %s %s %s %s' % (sys.argv[0], net_file,
+                                              weight_file, imgfile, net_name))
+        sys.exit(1)
+
+    infer(net_file, net_name, weight_file, imgfile)
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7a1a5ebd7c0a5090c00a0c8ca6b0e11b110967dc
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+#function:
+#   a tool used to:
+#   1, convert a caffe model
+#   2, do inference using this model
+#
+#usage:
+#   bash run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50
+#
+
+#set -x
+if [[ $# -lt 3 ]];then
+    echo "usage:"
+    echo "  bash $0 [model_name] [cf_model_path] [pd_model_path] [only_convert]"
+    echo "  eg: bash $0 resnet50 ./models.caffe/resnet50 ./models/resnet50"
+    exit 1
+else
+    model_name=$1
+    cf_model_path=$2
+    pd_model_path=$3
+    only_convert=$4
+fi
+
+proto_file=$cf_model_path/${model_name}.prototxt
+caffemodel_file=$cf_model_path/${model_name}.caffemodel
+weight_file=$pd_model_path/${model_name}.npy
+net_file=$pd_model_path/${model_name}.py
+
+if [[ ! -e $proto_file ]];then
+    echo "not found prototxt[$proto_file]"
+    exit 1
+fi
+
+if [[ ! -e $caffemodel_file ]];then
+    echo "not found caffemodel[$caffemodel_file]"
+    exit 1
+fi
+
+if [[ ! -e $pd_model_path ]];then
+    mkdir $pd_model_path
+fi
+
+PYTHON=`which cfpython`
+if [[ -z $PYTHON ]];then
+    PYTHON=`which python`
+fi
+$PYTHON ../../convert.py \
+        $proto_file \
+        --caffemodel $caffemodel_file \
+        --data-output-path $weight_file\
+        --code-output-path $net_file
+
+ret=$?
+if [[ $ret -ne 0 ]];then
+    echo "failed to convert caffe model[$cf_model_path]"
+    exit $ret
+else
+    echo "succeed to convert caffe model[$cf_model_path] to fluid model[$pd_model_path]"
+fi
+
+if [[ -z $only_convert ]];then
+    PYTHON=`which pdpython`
+    if [[ -z $PYTHON ]];then
+        PYTHON=`which python`
+    fi
+    imgfile="data/65.jpeg"
+    net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/\"([^\"]+)\"/){ print $1."\n";}'`
+    $PYTHON ./infer.py $net_file $weight_file $imgfile $net_name
+    ret=$?
+fi
+exit $ret
diff --git a/fluid/image_classification/caffe2fluid/examples/mnist/README.md b/fluid/image_classification/caffe2fluid/examples/mnist/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cd427d632737c8988403f987d86c159500022198
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/mnist/README.md
@@ -0,0 +1,10 @@
+a demo to show converting caffe model on 'mnist' using caffe2fluid
+
+---
+
+# How to use
+
+1. prepare python environment
+2. download caffe model to "models.caffe/lenet" which contains "lenet.caffemodel" and "lenet.prototxt"
Run the tool, e.g.:
+   bash ./run.sh lenet ./models.caffe/lenet ./models/lenet
diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/predict.py b/fluid/image_classification/caffe2fluid/examples/mnist/evaluate.py
similarity index 66%
rename from fluid/image_classification/caffe2fluid/tests/lenet/predict.py
rename to fluid/image_classification/caffe2fluid/examples/mnist/evaluate.py
index 7405cc6f848ea139bc4edd4c3ec0e0af773ea25a..5c86635d5a014262bdec40fe063915350c5fadb3 100644
--- a/fluid/image_classification/caffe2fluid/tests/lenet/predict.py
+++ b/fluid/image_classification/caffe2fluid/examples/mnist/evaluate.py
@@ -4,12 +4,12 @@
 #   demo to show how to use converted model using caffe2fluid
 #
+import sys
+import os
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-from lenet import LeNet as MyNet
-

 def test_model(exe, test_program, fetch_list, test_reader, feeder):
     acc_set = []
@@ -24,10 +24,15 @@ def test_model(exe, test_program, fetch_list, test_reader, feeder):
     return float(acc_val)

-def main(model_path):
+def evaluate(net_file, model_file):
     """ main
     """
-    print('load fluid model in %s' % (model_path))
+    #1, build model
+    net_path = os.path.dirname(net_file)
+    if net_path not in sys.path:
+        sys.path.insert(0, net_path)
+
+    from lenet import LeNet as MyNet

     with_gpu = False
     paddle.init(use_gpu=with_gpu)
@@ -45,10 +50,10 @@
     exe.run(fluid.default_startup_program())

     #2, load weights
-    if model_path.find('.npy') > 0:
-        net.load(data_path=model_path, exe=exe, place=place)
+    if model_file.find('.npy') > 0:
+        net.load(data_path=model_file, exe=exe, place=place)
     else:
-        net.load(data_path=model_path, exe=exe)
+        net.load(data_path=model_file, exe=exe)

     #3, test this model
     test_program = fluid.default_main_program().clone()
@@ -65,10 +70,17 @@


 if __name__ == "__main__":
-    import sys
-    if len(sys.argv) == 2:
-        fluid_model_path = sys.argv[1]
-    else:
-        fluid_model_path = './model.fluid'
-
-    main(fluid_model_path)
+    net_file = 'models/lenet/lenet.py'
+    weight_file = 'models/lenet/lenet.npy'
+
+    argc = len(sys.argv)
+    if argc == 3:
+        net_file = sys.argv[1]
+        weight_file = sys.argv[2]
+    elif argc > 1:
+        print('usage:')
+        print('\tpython %s [net_file] [weight_file]' % (sys.argv[0]))
+        print('\teg:python %s %s %s' % (sys.argv[0], net_file, weight_file))
+        sys.exit(1)
+
+    evaluate(net_file, weight_file)
diff --git a/fluid/image_classification/caffe2fluid/examples/mnist/run.sh b/fluid/image_classification/caffe2fluid/examples/mnist/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..eee83ef7cefd594c62fd95db525f081a27c6ea38
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/mnist/run.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+#function:
+#   a tool used to:
+#      1, convert a caffe model
+#      2, evaluate this model
+#
+#usage:
+#   bash run.sh lenet ./models.caffe/lenet ./models/lenet
+#
+
+#set -x
+if [[ $# -lt 3 ]];then
+    echo "usage:"
+    echo "  bash $0 [model_name] [cf_model_path] [pd_model_path] [only_convert]"
+    echo "  eg: bash $0 lenet ./models.caffe/lenet ./models/lenet"
+    exit 1
+else
+    model_name=$1
+    cf_model_path=$2
+    pd_model_path=$3
+    only_convert=$4
+fi
+
+proto_file=$cf_model_path/${model_name}.prototxt
+caffemodel_file=$cf_model_path/${model_name}.caffemodel
+weight_file=$pd_model_path/${model_name}.npy
+net_file=$pd_model_path/${model_name}.py
+
+if [[ ! -e $proto_file ]];then
+    echo "prototxt[$proto_file] not found"
+    exit 1
+fi
+
+if [[ ! 
-e $caffemodel_file ]];then
+    echo "caffemodel[$caffemodel_file] not found"
+    exit 1
+fi
+
+if [[ ! -e $pd_model_path ]];then
+    mkdir -p $pd_model_path
+fi
+
+PYTHON=`which cfpython`
+if [[ -z $PYTHON ]];then
+    PYTHON=`which python`
+fi
+$PYTHON ../../convert.py \
+        $proto_file \
+        --caffemodel $caffemodel_file \
+        --data-output-path $weight_file \
+        --code-output-path $net_file
+
+ret=$?
+if [[ $ret -ne 0 ]];then
+    echo "failed to convert caffe model[$cf_model_path]"
+    exit $ret
+else
+    echo "succeeded in converting caffe model[$cf_model_path] to fluid model[$pd_model_path]"
+fi
+
+if [[ -z $only_convert ]];then
+    PYTHON=`which pdpython`
+    if [[ -z $PYTHON ]];then
+        PYTHON=`which python`
+    fi
+    net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/\"([^\"]+)\"/){ print $1."\n";}'`
+    if [[ $net_name != "LeNet" ]];then
+        echo "only LeNet is supported"
+        exit 1
+    fi
+    $PYTHON ./evaluate.py $net_file $weight_file
+    ret=$?
+fi
+exit $ret
diff --git a/fluid/image_classification/caffe2fluid/kaffe/caffe/resolver.py b/fluid/image_classification/caffe2fluid/kaffe/caffe/resolver.py
index 5fbd48d3ade5ab4b812210acf82be625871740cb..6ad7767ed8a88f1c0258ad36cc35221c33b641e5 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/caffe/resolver.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/caffe/resolver.py
@@ -54,7 +54,6 @@ def show_fallback_warning():
 WARNING: PyCaffe not found! Falling back to a pure protocol buffer implementation.
 * Conversions will be drastically slower.
-* This backend is UNTESTED!
 ------------------------------------------------------------
 '''
diff --git a/fluid/image_classification/caffe2fluid/kaffe/graph.py b/fluid/image_classification/caffe2fluid/kaffe/graph.py
index cb751dffa1ca9cc19214bed12681312942046df6..5387f441852b8a318a41898ee0b62b4903ccdabb 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/graph.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/graph.py
@@ -175,6 +175,7 @@ class GraphBuilder(object):
         kind = NodeKind.map_raw_kind(layer.type)
         if kind is None:
             raise KaffeError('Unknown layer type encountered: %s' % layer.type)
+
         # We want to use the layer's top names (the "output" names), rather than the
         # name attribute, which is more of readability thing than a functional one.
         # Other layers will refer to a node by its "top name".
@@ -235,6 +236,7 @@ class GraphBuilder(object):
                 node.add_parent(parent_node)
             if len(layer.top) > 1:
                 raise KaffeError('Multiple top nodes are not supported.')
+
             for output_name in layer.top:
                 if output_name == layer.name:
                     # Output is named the same as the node. No further action required. 
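The layers.py change below lets the converter handle prototxts written in Caffe's legacy V1LayerParameter format, where layer types are identified by enum value rather than by name. A minimal sketch of the enum-to-name lookup it builds, using an assumed small subset of the (name, enum) pairs from the patch:

    # Sketch of the V1 enum -> layer-name lookup (subset of the full tables).
    V1_PAIRS = [('CONVOLUTION', 4), ('POOLING', 17), ('RELU', 18)]
    LAYER_TYPES = ['Convolution', 'Pooling', 'ReLU']  # keys of LAYER_DESCRIPTORS

    name2layer = {t.upper(): t for t in LAYER_TYPES}
    v1_map = {enum: name2layer[name] for name, enum in V1_PAIRS}

    # map_raw_kind can then resolve a raw kind given as a string or an enum value.
    assert v1_map[4] == 'Convolution'
    assert v1_map[17] == 'Pooling'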
diff --git a/fluid/image_classification/caffe2fluid/kaffe/layers.py b/fluid/image_classification/caffe2fluid/kaffe/layers.py
index 6be35ed727fed76a1c96017455bdaa354ace9f97..f263407ab41458573f2df775f99202bed0e9d894 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/layers.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/layers.py
@@ -51,17 +51,79 @@ LAYER_DESCRIPTORS = {
     'Threshold': shape_identity,
 }

-LAYER_TYPES = LAYER_DESCRIPTORS.keys()
+# layer types in 'V1LayerParameter'
+# (V1 layer type name, enum value)
+v1_layertypes = [
+    ('ABSVAL', 35),
+    ('ACCURACY', 1),
+    ('ARGMAX', 30),
+    ('BNLL', 2),
+    ('CONCAT', 3),
+    ('CONVOLUTION', 4),
+    ('DATA', 5),
+    ('DECONVOLUTION', 39),
+    ('DROPOUT', 6),
+    ('ELTWISE', 25),
+    ('EXP', 38),
+    ('FLATTEN', 8),
+    ('IM2COL', 11),
+    ('INNERPRODUCT', 14),
+    ('LRN', 15),
+    ('MEMORYDATA', 29),
+    ('MULTINOMIALLOGISTICLOSS', 16),
+    ('MVN', 34),
+    ('POOLING', 17),
+    ('POWER', 26),
+    ('RELU', 18),
+    ('SIGMOID', 19),
+    ('SIGMOIDCROSSENTROPYLOSS', 27),
+    ('SILENCE', 36),
+    ('SOFTMAX', 20),
+    ('SPLIT', 22),
+    ('SLICE', 33),
+    ('TANH', 23),
+    ('WINDOWDATA', 24),
+    ('THRESHOLD', 31),
+]

+LAYER_TYPES = LAYER_DESCRIPTORS.keys()
 LayerType = type('LayerType', (), {t: t for t in LAYER_TYPES})

+# map V1 layer type enum values to the standard layer type names
+V1_LAYER_MAP = {'_not_init_': True}
+
+
+def get_v1_layer_map():
+    global V1_LAYER_MAP
+    if '_not_init_' not in V1_LAYER_MAP:
+        return V1_LAYER_MAP
+    else:
+        del V1_LAYER_MAP['_not_init_']
+
+    name2layer = {}
+    for n in LAYER_TYPES:
+        name2layer[n.upper()] = n
+
+    for l in v1_layertypes:
+        n, v = l
+        if n in name2layer and v not in V1_LAYER_MAP:
+            V1_LAYER_MAP[v] = name2layer[n]
+        else:
+            raise KaffeError('unknown v1 layer type: %s' % n)
+    return V1_LAYER_MAP
+

 class NodeKind(LayerType):
     @staticmethod
     def map_raw_kind(kind):
         if kind in LAYER_TYPES:
             return kind
-        return None
+
+        v1_layers = get_v1_layer_map()
+        if kind in v1_layers:
+            return v1_layers[kind]
+        else:
+            return None

     @staticmethod
     def compute_output_shape(node):
diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
index 620a84e8f1289672151f1f280559a56b37995ce0..fd6a71cb6acbfffe2aed1d3680fb91c8c85dc3d3 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
@@ -27,6 +27,9 @@ def layer(op):
         self.layers[name] = layer_output
         # This output is now the input for the next layer.
         self.feed(layer_output)
+        #print('output shape of %s:' % (name))
+        #print layer_output.shape
+
         # Return self for chained calls. 
        return self
@@ -158,41 +161,64 @@ class Network(object):
         output = fluid.layers.relu(x=input)
         return output

-    @layer
-    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
-        if padding is None:
-            padding = [0, 0]
+    def _adjust_pad_if_needed(self, i_hw, k_hw, s_hw, p_hw):
+        #adjust the padding if needed
+        i_h, i_w = i_hw
+        k_h, k_w = k_hw
+        s_h, s_w = s_hw
+        p_h, p_w = p_hw
+
+        def is_consistent(i, k, s, p):
+            o = i + 2 * p - k
+            if o % s == 0:
+                return True
+            else:
+                return False
+
+        real_p_h = 0
+        real_p_w = 0
+        if not is_consistent(i_h, k_h, s_h, p_h):
+            real_p_h = int(k_h / 2)
+        if not is_consistent(i_w, k_w, s_w, p_w):
+            real_p_w = int(k_w / 2)
+
+        return [real_p_h, real_p_w]
+
+    def pool(self, pool_type, input, k_h, k_w, s_h, s_w, name, padding):
-        # Get the number of channels in the input
+        # Get the spatial (height, width) size of the input
-        h_i, w_i = input.shape[2:]
-        fluid = import_fluid()
-        output = fluid.layers.pool2d(
-            input=input,
-            pool_size=[k_h, k_w],
-            pool_stride=[s_h, s_w],
-            pool_padding=padding,
-            pool_type='max')
-        return output
+        in_hw = input.shape[2:]
+        k_hw = [k_h, k_w]
+        s_hw = [s_h, s_w]

-    @layer
-    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
         if padding is None:
-            padding = [0, 0]
+            #fix the padding inconsistency between conv and pool in caffe
+            #more info: https://github.com/BVLC/caffe/issues/1318
+            padding = self._adjust_pad_if_needed(in_hw, k_hw, s_hw, [0, 0])

-        # Get the number of channels in the input
-        h_i, w_i = input.shape[2:]
         fluid = import_fluid()
         output = fluid.layers.pool2d(
             input=input,
-            pool_size=[k_h, k_w],
-            pool_stride=[s_h, s_w],
+            pool_size=k_hw,
+            pool_stride=s_hw,
             pool_padding=padding,
-            pool_type='avg')
+            pool_type=pool_type)
         return output

+    @layer
+    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
+        return self.pool('max', input, k_h, k_w, s_h, s_w, name, padding)
+
+    @layer
+    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
+        return self.pool('avg', input, k_h, k_w, s_h, s_w, name, padding)
+
     @layer
     def lrn(self, input, radius, alpha, beta, name, bias=1.0):
-        raise Exception('lrn() not implemented yet')
+        fluid = import_fluid()
+        output = fluid.layers.lrn(input=input, \
+                n=radius, k=bias, alpha=alpha, beta=beta, name=name)
+        return output

     @layer
     def concat(self, inputs, axis, name):
@@ -228,7 +254,7 @@ class Network(object):
     @layer
     def softmax(self, input, name):
         fluid = import_fluid()
-        output = fluid.layers.softmax(x=input, name=name)
+        output = fluid.layers.softmax(input)
         return output

     @layer
@@ -256,5 +282,8 @@ class Network(object):
         return output

     @layer
-    def dropout(self, input, keep_prob, name):
-        raise Exception('dropout() not implemented yet')
+    def dropout(self, input, drop_prob, name, is_test=True):
+        fluid = import_fluid()
+        output = fluid.layers.dropout(
+            input, dropout_prob=drop_prob, is_test=is_test, name=name)
+        return output
diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
index 92b9d32a3a755d8e6a2a8739cc3f42f9c8564b40..4d7ec49a39199bb1415f830d88f89e93a4b95266 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
@@ -132,8 +132,7 @@ class TensorFlowMapper(NodeMapper):
         # just scales by alpha (as does Krizhevsky's paper).
         # We'll account for that here. 
alpha = params.alpha / float(params.local_size) - return TensorFlowNode('lrn', - int(params.local_size / 2), alpha, params.beta) + return TensorFlowNode('lrn', params.local_size, alpha, params.beta) def map_concat(self, node): return TensorFlowNode('concat', node.parameters.axis) @@ -191,22 +190,33 @@ class TensorFlowEmitter(object): def emit_setup_def(self): return self.statement('def setup(self):') - def emit_convert_def(self, input_nodes): - def data_layer_def(name, shape, dtype=None): - if dtype is None: - dtype = 'float32' + def emit_shape_def(self, input_nodes): + self.outdent() + func_def = self.statement('@classmethod') + func_def += self.statement('def input_shapes(cls):') + self.indent() - layer_var = name + '_layer' - shape = [str(s) for s in shape[1:]] - layer_def = '%s = fluid.layers.data(name="%s", shape=[%s], dtype="%s")'\ - % (layer_var, name, ','.join(shape), dtype) - return layer_var, layer_def + input_shapes = {} + for n in input_nodes: + name = n.name + output_shape = n.output_shape + shape = [str(s) for s in output_shape[1:]] + input_shapes[name] = ', '.join(shape) + input_shapes = ['"%s": [%s]' % (n, l) for n, l in input_shapes.items()] + shape_str = ','.join(input_shapes) + func_def += self.statement('return {%s}' % (shape_str)) + return '\n\n' + func_def + def emit_convert_def(self, input_nodes): codes = [] inputs = {} + codes.append('shapes = cls.input_shapes()') for n in input_nodes: name = n.name - layer_var, layer_def = data_layer_def(n.name, n.output_shape) + layer_var = name + '_layer' + layer_def = '%s = fluid.layers.data(name="%s", shape=shapes["%s"],'\ + ' dtype="float32")' % (layer_var, name, name) + #layer_var, layer_def = data_layer_def(n.name, n.output_shape) codes.append(layer_def) inputs[name] = layer_var @@ -229,7 +239,7 @@ class TensorFlowEmitter(object): func_def += self.statement('import paddle.v2.fluid as fluid') for l in codes: func_def += self.statement(l) - return '\n\n' + func_def + return '\n' + func_def def emit_main_def(self, name): if name is None: @@ -273,6 +283,7 @@ class TensorFlowEmitter(object): b += self.emit_node(node) blocks.append(b[:-1]) s = s + '\n\n'.join(blocks) + s += self.emit_shape_def(input_nodes) s += self.emit_convert_def(input_nodes) s += self.emit_main_def(name) return s diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/README.md b/fluid/image_classification/caffe2fluid/tests/lenet/README.md deleted file mode 100644 index 982edc2aa67f43f849bb2523b1a15edaa02f5d28..0000000000000000000000000000000000000000 --- a/fluid/image_classification/caffe2fluid/tests/lenet/README.md +++ /dev/null @@ -1,28 +0,0 @@ -### Convert lenet model from caffe format into paddle format(fluid api) - -### Howto -1, Prepare your caffepb.py - -2, Download a lenet caffe-model - lenet_iter_10000.caffemodel - download address: https://github.com/ethereon/caffe-tensorflow/raw/master/examples/mnist/lenet_iter_10000.caffemodel - md5: cbec75c1c374b6c1981c4a1eb024ae01 - - lenet.prototxt - download address: https://raw.githubusercontent.com/BVLC/caffe/master/examples/mnist/lenet.prototxt - md5: 27384af843338ab90b00c8d1c81de7d5 - - -2, Convert this model(make sure caffepb.py is ready in ../../proto) - convert to npy format - bash ./convert.sh lenet.prototxt lenet.caffemodel lenet.py lenet.npy - - save to fluid format(optional) - bash ./convert.sh lenet.prototxt lenet.caffemodel lenet.py lenet.npy && python ./lenet.py ./lenet.npy ./fluid.model - -4, Use this new model(paddle installed in this python) - use fluid format - python ./predict.py 
./fluid.model - - use npy format - python ./predict.py ./lenet.npy diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/convert.sh b/fluid/image_classification/caffe2fluid/tests/lenet/convert.sh deleted file mode 100755 index b3ec1a1dce2434a4466cf5d4609de1b4aec9d346..0000000000000000000000000000000000000000 --- a/fluid/image_classification/caffe2fluid/tests/lenet/convert.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -#function: -# convert a caffe model -# eg: -# bash ./convert.sh ./model.caffe/lenet.prototxt ./model.caffe/lenet.caffemodel lenet.py lenet.npy - -if [[ $# -ne 4 ]];then - echo "usage:" - echo " bash $0 [PROTOTXT] [CAFFEMODEL] [PY_NAME] [WEIGHT_NAME]" - echo " eg: bash $0 lenet.prototxt lenet.caffemodel lenet.py lenet.npy" - exit 1 -fi - -WORK_ROOT=$(dirname `readlink -f ${BASH_SOURCE[0]}`) -if [[ -z $PYTHON ]];then - PYTHON=`which python` -fi - -PROTOTXT=$1 -CAFFEMODEL=$2 -PY_NAME=$3 -WEIGHT_NAME=$4 -CONVERTER_PY="$WORK_ROOT/../../convert.py" - -$PYTHON $CONVERTER_PY $PROTOTXT --caffemodel $CAFFEMODEL --code-output-path=$PY_NAME --data-output-path=$WEIGHT_NAME -ret=$? -if [[ $ret -eq 0 ]];then - echo "succeed to convert caffe model[$CAFFEMODEL, $PROTOTXT] to paddle model[$PY_NAME, $WEIGHT_NAME]" -else - echo "failed to convert caffe model[$CAFFEMODEL, $PROTOTXT]" -fi -exit $ret diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/lenet.npy b/fluid/image_classification/caffe2fluid/tests/lenet/lenet.npy deleted file mode 100644 index 66f773e5ffd54c8f5151b920aecdf3dd4f8c91d2..0000000000000000000000000000000000000000 Binary files a/fluid/image_classification/caffe2fluid/tests/lenet/lenet.npy and /dev/null differ diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/lenet.py b/fluid/image_classification/caffe2fluid/tests/lenet/lenet.py deleted file mode 100644 index 50e6927483a61c574f1152c6dc438a6b2c8a4d90..0000000000000000000000000000000000000000 --- a/fluid/image_classification/caffe2fluid/tests/lenet/lenet.py +++ /dev/null @@ -1,297 +0,0 @@ -### generated by caffe2fluid, your net is in class "LeNet" ### - -import math -import os -import numpy as np - - -def import_fluid(): - import paddle.v2.fluid as fluid - return fluid - - -def layer(op): - '''Decorator for composable network layers.''' - - def layer_decorated(self, *args, **kwargs): - # Automatically set a name if not provided. - name = kwargs.setdefault('name', self.get_unique_name(op.__name__)) - # Figure out the layer inputs. - if len(self.terminals) == 0: - raise RuntimeError('No input variables found for layer %s.' % name) - elif len(self.terminals) == 1: - layer_input = self.terminals[0] - else: - layer_input = list(self.terminals) - # Perform the operation and get the output. - layer_output = op(self, layer_input, *args, **kwargs) - # Add to layer LUT. - self.layers[name] = layer_output - # This output is now the input for the next layer. - self.feed(layer_output) - # Return self for chained calls. - return self - - return layer_decorated - - -class Network(object): - def __init__(self, inputs, trainable=True): - # The input nodes for this network - self.inputs = inputs - # The current list of terminal nodes - self.terminals = [] - # Mapping from layer names to layers - self.layers = dict(inputs) - # If true, the resulting variables are set as trainable - self.trainable = trainable - # Switch variable for dropout - self.paddle_env = None - self.setup() - - def setup(self): - '''Construct the network. 
''' - raise NotImplementedError('Must be implemented by the subclass.') - - def load(self, data_path, exe=None, place=None, ignore_missing=False): - '''Load network weights. - data_path: The path to the numpy-serialized network weights - ignore_missing: If true, serialized weights for missing layers are ignored. - ''' - fluid = import_fluid() - #load fluid mode directly - if os.path.isdir(data_path): - assert (exe is not None), \ - 'must provide a executor to load fluid model' - fluid.io.load_persistables_if_exist(executor=exe, dirname=data_path) - return True - - #load model from a npy file - if exe is None or place is None: - if self.paddle_env is None: - place = fluid.CPUPlace() - exe = fluid.Executor(place) - self.paddle_env = {'place': place, 'exe': exe} - exe = exe.run(fluid.default_startup_program()) - else: - place = self.paddle_env['place'] - exe = self.paddle_env['exe'] - - data_dict = np.load(data_path).item() - for op_name in data_dict: - layer = self.layers[op_name] - for param_name, data in data_dict[op_name].iteritems(): - try: - name = '%s_%s' % (op_name, param_name) - v = fluid.global_scope().find_var(name) - w = v.get_tensor() - w.set(data, place) - except ValueError: - if not ignore_missing: - raise - return True - - def feed(self, *args): - '''Set the input(s) for the next operation by replacing the terminal nodes. - The arguments can be either layer names or the actual layers. - ''' - assert len(args) != 0 - self.terminals = [] - for fed_layer in args: - if isinstance(fed_layer, basestring): - try: - fed_layer = self.layers[fed_layer] - except KeyError: - raise KeyError('Unknown layer name fed: %s' % fed_layer) - self.terminals.append(fed_layer) - return self - - def get_output(self): - '''Returns the current network output.''' - return self.terminals[-1] - - def get_unique_name(self, prefix): - '''Returns an index-suffixed unique name for the given prefix. - This is used for auto-generating layer names based on the type-prefix. 
- ''' - ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1 - return '%s_%d' % (prefix, ident) - - @layer - def conv(self, - input, - k_h, - k_w, - c_o, - s_h, - s_w, - name, - relu=True, - padding=None, - group=1, - biased=True): - if padding is None: - padding = [0, 0] - - # Get the number of channels in the input - c_i, h_i, w_i = input.shape[1:] - - # Verify that the grouping parameter is valid - assert c_i % group == 0 - assert c_o % group == 0 - - fluid = import_fluid() - prefix = name + '_' - output = fluid.layers.conv2d( - input=input, - filter_size=[k_h, k_w], - num_filters=c_o, - stride=[s_h, s_w], - padding=padding, - groups=group, - param_attr=fluid.ParamAttr(name=prefix + "weights"), - bias_attr=fluid.ParamAttr(name=prefix + "biases"), - act="relu" if relu is True else None) - return output - - @layer - def relu(self, input, name): - fluid = import_fluid() - output = fluid.layers.relu(x=input) - return output - - @layer - def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None): - if padding is None: - padding = [0, 0] - - # Get the number of channels in the input - h_i, w_i = input.shape[2:] - fluid = import_fluid() - output = fluid.layers.pool2d( - input=input, - pool_size=[k_h, k_w], - pool_stride=[s_h, s_w], - pool_padding=padding, - pool_type='max') - return output - - @layer - def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None): - if padding is None: - padding = [0, 0] - - # Get the number of channels in the input - h_i, w_i = input.shape[2:] - fluid = import_fluid() - output = fluid.layers.pool2d( - input=input, - pool_size=[k_h, k_w], - pool_stride=[s_h, s_w], - pool_padding=padding, - pool_type='avg') - return output - - @layer - def lrn(self, input, radius, alpha, beta, name, bias=1.0): - raise Exception('lrn() not implemented yet') - - @layer - def concat(self, inputs, axis, name): - fluid = import_fluid() - output = fluid.layers.concat(input=inputs, axis=axis) - return output - - @layer - def add(self, inputs, name): - fluid = import_fluid() - output = inputs[0] - for i in inputs[1:]: - output = fluid.layers.elementwise_add(x=output, y=i) - return output - - @layer - def fc(self, input, num_out, name, relu=True, act=None): - fluid = import_fluid() - - if act is None: - act = 'relu' if relu is True else None - - prefix = name + '_' - output = fluid.layers.fc( - name=name, - input=input, - size=num_out, - act=act, - param_attr=fluid.ParamAttr(name=prefix + 'weights'), - bias_attr=fluid.ParamAttr(name=prefix + 'biases')) - return output - - @layer - def softmax(self, input, name): - fluid = import_fluid() - output = fluid.layers.softmax(x=input, name=name) - return output - - @layer - def batch_normalization(self, input, name, scale_offset=True, relu=False): - # NOTE: Currently, only inference is supported - fluid = import_fluid() - prefix = name + '_' - param_attr = None if scale_offset is False else fluid.ParamAttr( - name=prefix + 'scale') - bias_attr = None if scale_offset is False else fluid.ParamAttr( - name=prefix + 'offset') - mean_name = prefix + 'mean' - variance_name = prefix + 'variance' - output = fluid.layers.batch_norm( - name=name, - input=input, - is_test=True, - param_attr=param_attr, - bias_attr=bias_attr, - moving_mean_name=mean_name, - moving_variance_name=variance_name, - epsilon=1e-5, - act='relu' if relu is True else None) - - return output - - @layer - def dropout(self, input, keep_prob, name): - raise Exception('dropout() not implemented yet') - - -class LeNet(Network): - def setup(self): - 
self.feed('data') - self.conv(5, 5, 20, 1, 1, relu=False, name='conv1') - self.max_pool(2, 2, 2, 2, name='pool1') - self.conv(5, 5, 50, 1, 1, relu=False, name='conv2') - self.max_pool(2, 2, 2, 2, name='pool2') - self.fc(500, name='ip1') - self.fc(10, relu=False, name='ip2') - self.softmax(name='prob') - - @classmethod - def convert(cls, npy_model, fluid_path): - import paddle.v2.fluid as fluid - data_layer = fluid.layers.data( - name="data", shape=[1, 28, 28], dtype="float32") - feed_data = {"data": data_layer} - net = cls(feed_data) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - net.load(data_path=npy_model, exe=exe, place=place) - fluid.io.save_persistables(executor=exe, dirname=fluid_path) - - -if __name__ == "__main__": - #usage: python xxxnet.py xxx.npy ./model - - import sys - npy_weight = sys.argv[1] - fluid_model = sys.argv[2] - LeNet.convert(npy_weight, fluid_model) - exit(0) diff --git a/fluid/image_classification/se_resnext.py b/fluid/image_classification/se_resnext.py index c2b2d680fc995b1ea6cc5a2f640746a8a79ac029..b1adf0baba8a987ae1a971e148375c6a0730d860 100644 --- a/fluid/image_classification/se_resnext.py +++ b/fluid/image_classification/se_resnext.py @@ -1,4 +1,7 @@ import os +import numpy as np +import time +import sys import paddle.v2 as paddle import paddle.fluid as fluid import reader @@ -65,20 +68,44 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): return fluid.layers.elementwise_add(x=short, y=scale, act='relu') -def SE_ResNeXt(input, class_dim, infer=False): - cardinality = 64 - reduction_ratio = 16 - depth = [3, 8, 36, 3] - num_filters = [128, 256, 512, 1024] +def SE_ResNeXt(input, class_dim, infer=False, layers=50): + supported_layers = [50, 152] + if layers not in supported_layers: + print("supported layers are", supported_layers, "but input layer is", + layers) + exit() + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] - conv = conv_bn_layer( - input=input, num_filters=64, filter_size=3, stride=2, act='relu') - conv = conv_bn_layer( - input=conv, num_filters=64, filter_size=3, stride=1, act='relu') - conv = conv_bn_layer( - input=conv, num_filters=128, filter_size=3, stride=1, act='relu') - conv = fluid.layers.pool2d( - input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') + conv = conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + + conv = conv_bn_layer( + input=input, num_filters=64, filter_size=3, stride=2, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=128, filter_size=3, stride=1, act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') for block in range(len(depth)): for i in range(depth[block]): @@ -104,7 +131,10 @@ def train(learning_rate, num_passes, init_model=None, model_save_dir='model', - parallel=True): + parallel=True, + use_nccl=True, + lr_strategy=None, + layers=50): class_dim = 1000 image_shape = [3, 224, 224] @@ -113,36 +143,52 @@ def train(learning_rate, if parallel: places = fluid.layers.get_places() - pd = 
fluid.layers.ParallelDo(places)
+        pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
         with pd.do():
             image_ = pd.read_input(image)
             label_ = pd.read_input(label)
-            out = SE_ResNeXt(input=image_, class_dim=class_dim)
+            out = SE_ResNeXt(input=image_, class_dim=class_dim, layers=layers)
             cost = fluid.layers.cross_entropy(input=out, label=label_)
             avg_cost = fluid.layers.mean(x=cost)
-            accuracy = fluid.layers.accuracy(input=out, label=label_)
+            acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
+            acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
             pd.write_output(avg_cost)
-            pd.write_output(accuracy)
+            pd.write_output(acc_top1)
+            pd.write_output(acc_top5)

-        avg_cost, accuracy = pd()
+        avg_cost, acc_top1, acc_top5 = pd()
         avg_cost = fluid.layers.mean(x=avg_cost)
-        accuracy = fluid.layers.mean(x=accuracy)
+        acc_top1 = fluid.layers.mean(x=acc_top1)
+        acc_top5 = fluid.layers.mean(x=acc_top5)
     else:
-        out = SE_ResNeXt(input=image, class_dim=class_dim)
+        out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
         cost = fluid.layers.cross_entropy(input=out, label=label)
         avg_cost = fluid.layers.mean(x=cost)
-        accuracy = fluid.layers.accuracy(input=out, label=label)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+    if lr_strategy is None:
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    else:
+        bd = lr_strategy["bd"]
+        lr = lr_strategy["lr"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))

-    optimizer = fluid.optimizer.Momentum(
-        learning_rate=learning_rate,
-        momentum=0.9,
-        regularization=fluid.regularizer.L2Decay(1e-4))
     opts = optimizer.minimize(avg_cost)
+    fluid.memory_optimize(fluid.default_main_program())

     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program([avg_cost, accuracy])
+        inference_program = fluid.io.get_inference_program(
+            [avg_cost, acc_top1, acc_top5])

     place = fluid.CUDAPlace(0)
     exe = fluid.Executor(place)
@@ -156,34 +202,86 @@
     feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

     for pass_id in range(num_passes):
+        train_info = [[], [], []]
+        test_info = [[], [], []]
         for batch_id, data in enumerate(train_reader()):
-            loss = exe.run(fluid.default_main_program(),
-                           feed=feeder.feed(data),
-                           fetch_list=[avg_cost])
-            print("Pass {0}, batch {1}, loss {2}".format(pass_id, batch_id,
-                                                         float(loss[0])))
-
-        total_loss = 0.0
-        total_acc = 0.0
-        total_batch = 0
+            t1 = time.time()
+            loss, acc1, acc5 = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, acc_top1, acc_top5])
+            t2 = time.time()
+            period = t2 - t1
+            train_info[0].append(loss[0])
+            train_info[1].append(acc1[0])
+            train_info[2].append(acc5[0])
+            if batch_id % 10 == 0:
+                print("Pass {0}, trainbatch {1}, loss {2}, "
+                      "acc1 {3}, acc5 {4}, time {5}"
+                      .format(pass_id,
+                              batch_id, loss[0], acc1[0], acc5[0],
+                              "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        train_loss = np.array(train_info[0]).mean()
+        train_acc1 = np.array(train_info[1]).mean()
+        train_acc5 = np.array(train_info[2]).mean()
-        for data in test_reader():
+        for batch_id, data in enumerate(test_reader()):
-            loss, acc = exe.run(inference_program,
-                                feed=feeder.feed(data),
-                                fetch_list=[avg_cost, accuracy])
-            total_loss += 
float(loss)
-            total_acc += float(acc)
-            total_batch += 1
-        print("End pass {0}, test_loss {1}, test_acc {2}".format(
-            pass_id, total_loss / total_batch, total_acc / total_batch))
+            t1 = time.time()
+            loss, acc1, acc5 = exe.run(
+                inference_program,
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, acc_top1, acc_top5])
+            t2 = time.time()
+            period = t2 - t1
+            test_info[0].append(loss[0])
+            test_info[1].append(acc1[0])
+            test_info[2].append(acc5[0])
+            if batch_id % 10 == 0:
+                print("Pass {0}, testbatch {1}, loss {2}, "
+                      "acc1 {3}, acc5 {4}, time {5}"
+                      .format(pass_id,
+                              batch_id, loss[0], acc1[0], acc5[0],
+                              "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        test_loss = np.array(test_info[0]).mean()
+        test_acc1 = np.array(test_info[1]).mean()
+        test_acc5 = np.array(test_info[2]).mean()
+
+        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
+              "test_loss {4}, test_acc1 {5}, test_acc5 {6}"
+              .format(pass_id,
+                      train_loss, train_acc1, train_acc5, test_loss, test_acc1,
+                      test_acc5))
+        sys.stdout.flush()

         model_path = os.path.join(model_save_dir, str(pass_id))
-        fluid.io.save_inference_model(model_path, ['image'], [out], exe)
+        if not os.path.isdir(model_path):
+            os.makedirs(model_path)
+        fluid.io.save_persistables(exe, model_path)


 if __name__ == '__main__':
+    epoch_points = [30, 60, 90]
+    total_images = 1281167
+    batch_size = 256
+    step = int(total_images / batch_size + 1)
+    bd = [e * step for e in epoch_points]
+    lr = [0.1, 0.01, 0.001, 0.0001]
+
+    lr_strategy = {"bd": bd, "lr": lr}
+
+    use_nccl = True
+    # layers: 50, 152
+    layers = 50
+
     train(
         learning_rate=0.1,
-        batch_size=8,
-        num_passes=100,
+        batch_size=batch_size,
+        num_passes=120,
         init_model=None,
-        parallel=False)
+        parallel=True,
+        use_nccl=use_nccl,
+        lr_strategy=lr_strategy,
+        layers=layers)
diff --git a/fluid/neural_machine_translation/transformer/config.py b/fluid/neural_machine_translation/transformer/config.py
index b91a8672629e76557a005897f5fe044074633028..29d7c4868a61e489659c2ed04f0ff09ff48b52c8 100644
--- a/fluid/neural_machine_translation/transformer/config.py
+++ b/fluid/neural_machine_translation/transformer/config.py
@@ -22,7 +22,7 @@ class TrainTaskConfig(object):
 class InferTaskConfig(object):
     use_gpu = False
     # the number of examples in one run for sequence generation.
-    batch_size = 10
+    batch_size = 1

     # the parameters for beam search.
     beam_size = 5
@@ -92,7 +92,9 @@
 encoder_input_data_names = (
     "src_word",
     "src_pos",
     "src_slf_attn_bias",
-    "src_data_shape", )
+    "src_data_shape",
+    "src_slf_attn_pre_softmax_shape",
+    "src_slf_attn_post_softmax_shape", )

 # Names of all data layers in decoder listed in order.
 decoder_input_data_names = (
@@ -101,6 +103,10 @@
     "trg_slf_attn_bias",
     "trg_src_attn_bias",
     "trg_data_shape",
+    "trg_slf_attn_pre_softmax_shape",
+    "trg_slf_attn_post_softmax_shape",
+    "trg_src_attn_pre_softmax_shape",
+    "trg_src_attn_post_softmax_shape",
     "enc_output", )

 # Names of label related data layers listed in order. 
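The *_pre_softmax_shape and *_post_softmax_shape inputs added to config.py above exist because fluid's softmax op only supports 2-D tensors at this point (see the FIXME removed from model.py below): the 4-D attention scores are flattened to 2-D before softmax and restored afterwards, with the actual shapes fed in at run time. A minimal numpy sketch of that round trip, with assumed example dimensions:

    import numpy as np

    def softmax_2d(x):
        # row-wise softmax over a 2-D array
        e = np.exp(x - x.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)

    # attention scores: [batch, n_head, trg_len, src_len] (example sizes)
    scores = np.random.rand(2, 8, 4, 6).astype("float32")
    pre_shape = [-1, scores.shape[-1]]   # the pre-softmax shape input
    post_shape = scores.shape            # the post-softmax shape input

    weights = softmax_2d(scores.reshape(pre_shape)).reshape(post_shape)
    assert np.allclose(weights.sum(axis=-1), 1.0)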
diff --git a/fluid/neural_machine_translation/transformer/infer.py b/fluid/neural_machine_translation/transformer/infer.py index 5d572c566cb67e0035ad3d2271706dea8d3f1d83..ee7d5d41c0b407e4e9b60fc028e42f256dfbdfd7 100644 --- a/fluid/neural_machine_translation/transformer/infer.py +++ b/fluid/neural_machine_translation/transformer/infer.py @@ -1,6 +1,6 @@ import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import model @@ -27,11 +27,19 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, is_target=False, return_pos=True, return_attn_bias=True, - return_max_len=True) - enc_in_data = enc_in_data[:-1] + [ + return_max_len=False) + # Append the data shape input to reshape the output of embedding layer. + enc_in_data = enc_in_data + [ np.array( - [batch_size, enc_in_data[-1], d_model], dtype="int32") - ] # Append the data shape input. + [-1, enc_in_data[2].shape[-1], d_model], dtype="int32") + ] + # Append the shape inputs to reshape before and after softmax in encoder + # self attention. + enc_in_data = enc_in_data + [ + np.array( + [-1, enc_in_data[2].shape[-1]], dtype="int32"), np.array( + enc_in_data[2].shape, dtype="int32") + ] enc_output = exe.run(encoder, feed=dict(zip(enc_in_names, enc_in_data)), fetch_list=enc_out_names)[0] @@ -73,8 +81,8 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, trg_words = np.array( [[bos_idx]] * batch_size * beam_size, dtype="int64") trg_pos = np.array([[1]] * batch_size * beam_size, dtype="int64") - src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[-1][ - 1], enc_in_data[-2], 1 + src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[2].shape[ + -1], enc_in_data[2], 1 # This is used to remove attention on subsequent words. trg_slf_attn_bias = np.ones((batch_size * beam_size, trg_max_len, trg_max_len)) @@ -89,19 +97,38 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, -1, src_slf_attn_bias.shape[1], trg_max_len, src_slf_attn_bias.shape[-1] ]) + # Append the shape input to reshape the output of embedding layer. trg_data_shape = np.array( [batch_size * beam_size, trg_max_len, d_model], dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # decoder self attention. + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # encoder-decoder attention. + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") enc_output = np.tile( enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape( [-1, enc_output.shape[-2], enc_output.shape[-1]]) - return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, enc_output + return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map): """ Update the input data of decoder mainly by slicing from the previous input data and dropping the finished instance beams. 
""" - trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, enc_output = dec_in_data + trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output = dec_in_data trg_cur_len = trg_slf_attn_bias.shape[-1] + 1 trg_words = np.array( [ @@ -129,11 +156,27 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, trg_src_attn_bias = np.tile(trg_src_attn_bias[ active_beams_indice, :, ::trg_src_attn_bias.shape[2], :], [1, 1, trg_cur_len, 1]) + # Append the shape input to reshape the output of embedding layer. trg_data_shape = np.array( [len(active_beams) * beam_size, trg_cur_len, d_model], dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # decoder self attention. + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # encoder-decoder attention. + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") enc_output = enc_output[active_beams_indice, :, :] - return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, enc_output + return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data, enc_output) diff --git a/fluid/neural_machine_translation/transformer/model.py b/fluid/neural_machine_translation/transformer/model.py index ff339c31e5b5ef3fa1ece8c9195ea7e866f14f98..deba195c34f6c71fb4a78f46b4e2678f67feaa75 100644 --- a/fluid/neural_machine_translation/transformer/model.py +++ b/fluid/neural_machine_translation/transformer/model.py @@ -29,7 +29,9 @@ def multi_head_attention(queries, d_value, d_model, n_head=1, - dropout_rate=0.): + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): """ Multi-Head Attention. Note that attn_bias is added to the logit before computing softmax activiation to mask certain selected positions so that @@ -109,21 +111,16 @@ def multi_head_attention(queries, """ Scaled Dot-Product Attention """ - - # FIXME(guosheng): Remove __softmax when softmax_op supporting high - # rank tensors. softmax_op only supports 2D tensor currently. - # Otherwise, add extra input data to reshape. 
- def __softmax(x, eps=1e-9): - exp_out = layers.exp(x=x) - sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False) - return layers.elementwise_div(x=exp_out, y=sum_out, axis=0) - scaled_q = layers.scale(x=q, scale=d_model**-0.5) product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - weights = __softmax( - layers.elementwise_add( - x=product, y=attn_bias) if attn_bias else product) - # weights = __softmax(product) + weights = layers.reshape( + x=layers.elementwise_add( + x=product, y=attn_bias) if attn_bias else product, + shape=[-1, product.shape[-1]], + actual_shape=pre_softmax_shape, + act="softmax") + weights = layers.reshape( + x=weights, shape=product.shape, actual_shape=post_softmax_shape) if dropout_rate: weights = layers.dropout( weights, dropout_prob=dropout_rate, is_test=False) @@ -248,7 +245,9 @@ def encoder_layer(enc_input, d_value, d_model, d_inner_hid, - dropout_rate=0.): + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): """The encoder layers that can be stacked to form a deep encoder. This module consits of a multi-head (self) attention followed by @@ -256,9 +255,9 @@ def encoder_layer(enc_input, with the post_process_layer to add residual connection, layer normalization and droput. """ - attn_output = multi_head_attention(enc_input, enc_input, enc_input, - attn_bias, d_key, d_value, d_model, - n_head, dropout_rate) + attn_output = multi_head_attention( + enc_input, enc_input, enc_input, attn_bias, d_key, d_value, d_model, + n_head, dropout_rate, pre_softmax_shape, post_softmax_shape) attn_output = post_process_layer(enc_input, attn_output, "dan", dropout_rate) ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model) @@ -273,7 +272,9 @@ def encoder(enc_input, d_value, d_model, d_inner_hid, - dropout_rate=0.): + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): """ The encoder is composed of a stack of identical layers returned by calling encoder_layer. @@ -287,7 +288,9 @@ def encoder(enc_input, d_value, d_model, d_inner_hid, - dropout_rate, ) + dropout_rate, + pre_softmax_shape, + post_softmax_shape, ) enc_input = enc_output return enc_output @@ -301,7 +304,11 @@ def decoder_layer(dec_input, d_value, d_model, d_inner_hid, - dropout_rate=0.): + dropout_rate=0., + slf_attn_pre_softmax_shape=None, + slf_attn_post_softmax_shape=None, + src_attn_pre_softmax_shape=None, + src_attn_post_softmax_shape=None): """ The layer to be stacked in decoder part. The structure of this module is similar to that in the encoder part except @@ -316,7 +323,9 @@ def decoder_layer(dec_input, d_value, d_model, n_head, - dropout_rate, ) + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, ) slf_attn_output = post_process_layer( dec_input, slf_attn_output, @@ -331,7 +340,9 @@ def decoder_layer(dec_input, d_value, d_model, n_head, - dropout_rate, ) + dropout_rate, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) enc_attn_output = post_process_layer( slf_attn_output, enc_attn_output, @@ -359,7 +370,11 @@ def decoder(dec_input, d_value, d_model, d_inner_hid, - dropout_rate=0.): + dropout_rate=0., + slf_attn_pre_softmax_shape=None, + slf_attn_post_softmax_shape=None, + src_attn_pre_softmax_shape=None, + src_attn_post_softmax_shape=None): """ The decoder is composed of a stack of identical decoder_layer layers. 
""" @@ -374,7 +389,11 @@ def decoder(dec_input, d_value, d_model, d_inner_hid, - dropout_rate, ) + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) dec_input = dec_output return dec_output @@ -383,11 +402,13 @@ def make_inputs(input_data_names, n_head, d_model, max_length, - is_pos=True, - slf_attn_bias_flag=True, - src_attn_bias_flag=True, + is_pos, + slf_attn_bias_flag, + src_attn_bias_flag, enc_output_flag=False, - data_shape_flag=True): + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=True): """ Define the input data layers for the transformer model. """ @@ -425,7 +446,8 @@ def make_inputs(input_data_names, append_batch_size=False) input_layers += [slf_attn_bias] if src_attn_bias_flag: - # This input is used to remove attention weights on paddings. + # This input is used to remove attention weights on paddings. It's used + # in encoder-decoder attention. # The actual data shape of slf_attn_bias_flag is: # [batch_size, n_head, trg_max_len_in_batch, src_max_len_in_batch] src_attn_bias = layers.data( @@ -435,13 +457,41 @@ def make_inputs(input_data_names, append_batch_size=False) input_layers += [src_attn_bias] if data_shape_flag: - # This input is used to reshape. + # This input is used to reshape the output of embedding layer. data_shape = layers.data( name=input_data_names[len(input_layers)], shape=[3], dtype="int32", append_batch_size=False) input_layers += [data_shape] + if slf_attn_shape_flag: + # This shape input is used to reshape before softmax in self attention. + slf_attn_pre_softmax_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[2], + dtype="int32", + append_batch_size=False) + input_layers += [slf_attn_pre_softmax_shape] + # This shape input is used to reshape after softmax in self attention. + slf_attn_post_softmax_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[4], + dtype="int32", + append_batch_size=False) + input_layers += [slf_attn_post_softmax_shape] + if src_attn_shape_flag: + src_attn_pre_softmax_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[2], + dtype="int32", + append_batch_size=False) + input_layers += [src_attn_pre_softmax_shape] + src_attn_post_softmax_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[4], + dtype="int32", + append_batch_size=False) + input_layers += [src_attn_post_softmax_shape] if enc_output_flag: # This input is used in independent decoder program for inference. 
# The actual data shape of slf_attn_bias_flag is: @@ -452,6 +502,7 @@ def make_inputs(input_data_names, dtype="float32", append_batch_size=False) input_layers += [enc_output] + return input_layers @@ -469,8 +520,18 @@ def transformer( src_pad_idx, trg_pad_idx, pos_pad_idx, ): - enc_inputs = make_inputs(encoder_input_data_names, n_head, d_model, - max_length, True, True, False) + enc_inputs = make_inputs( + encoder_input_data_names, + n_head, + d_model, + max_length, + is_pos=True, + slf_attn_bias_flag=True, + src_attn_bias_flag=False, + enc_output_flag=False, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=False) enc_output = wrap_encoder( src_vocab_size, @@ -486,8 +547,18 @@ def transformer( pos_pad_idx, enc_inputs, ) - dec_inputs = make_inputs(decoder_input_data_names, n_head, d_model, - max_length, True, True, True) + dec_inputs = make_inputs( + decoder_input_data_names, + n_head, + d_model, + max_length, + is_pos=True, + slf_attn_bias_flag=True, + src_attn_bias_flag=True, + enc_output_flag=False, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=True) predict = wrap_decoder( trg_vocab_size, @@ -506,9 +577,19 @@ def transformer( # Padding index do not contribute to the total loss. The weights is used to # cancel padding index in calculating the loss. - gold, weights = make_inputs(label_data_names, n_head, d_model, max_length, - False, False, False, False, False) - cost = layers.cross_entropy(input=predict, label=gold) + gold, weights = make_inputs( + label_data_names, + n_head, + d_model, + max_length, + is_pos=False, + slf_attn_bias_flag=False, + src_attn_bias_flag=False, + enc_output_flag=False, + data_shape_flag=False, + slf_attn_shape_flag=False, + src_attn_shape_flag=False) + cost = layers.softmax_with_cross_entropy(logits=predict, label=gold) weighted_cost = cost * weights return layers.reduce_sum(weighted_cost), predict @@ -530,12 +611,24 @@ def wrap_encoder(src_vocab_size, """ if enc_inputs is None: # This is used to implement independent encoder program in inference. - src_word, src_pos, src_slf_attn_bias, src_data_shape = make_inputs( - encoder_input_data_names, n_head, d_model, max_length, True, True, - False) + src_word, src_pos, src_slf_attn_bias, src_data_shape, \ + slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ + make_inputs( + encoder_input_data_names, + n_head, + d_model, + max_length, + is_pos=True, + slf_attn_bias_flag=True, + src_attn_bias_flag=False, + enc_output_flag=False, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=False) else: - src_word, src_pos, src_slf_attn_bias, src_data_shape = enc_inputs - + src_word, src_pos, src_slf_attn_bias, src_data_shape, \ + slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ + enc_inputs enc_input = prepare_encoder( src_word, src_pos, @@ -555,7 +648,9 @@ def wrap_encoder(src_vocab_size, d_value, d_model, d_inner_hid, - dropout_rate, ) + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, ) return enc_output @@ -577,11 +672,26 @@ def wrap_decoder(trg_vocab_size, """ if dec_inputs is None: # This is used to implement independent decoder program in inference. 
- trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, enc_output = make_inputs( - decoder_input_data_names, n_head, d_model, max_length, True, True, - True, True) + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, slf_attn_pre_softmax_shape, \ + slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ + src_attn_post_softmax_shape, enc_output = make_inputs( + decoder_input_data_names, + n_head, + d_model, + max_length, + is_pos=True, + slf_attn_bias_flag=True, + src_attn_bias_flag=True, + enc_output_flag=True, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=True) else: - trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape = dec_inputs + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, slf_attn_pre_softmax_shape, \ + slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ + src_attn_post_softmax_shape = dec_inputs dec_input = prepare_decoder( trg_word, @@ -604,13 +714,17 @@ def wrap_decoder(trg_vocab_size, d_value, d_model, d_inner_hid, - dropout_rate, ) - + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) + # Return logits for training and probs for inference. predict = layers.reshape( x=layers.fc(input=dec_output, size=trg_vocab_size, bias_attr=False, num_flatten_dims=2), shape=[-1, trg_vocab_size], - act="softmax") + act="softmax" if dec_inputs is None else None) return predict diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py index b00552333aed71f6be09c238f4d42456bd9d20ee..83098e9b8720ef9652f4071f37c1fb7af83109c0 100644 --- a/fluid/neural_machine_translation/transformer/train.py +++ b/fluid/neural_machine_translation/transformer/train.py @@ -1,7 +1,7 @@ import os import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid from model import transformer, position_encoding_init @@ -66,16 +66,35 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], [1, 1, trg_max_len, 1]).astype("float32") + + # These shape tensors are used in reshape_op. 
+ src_data_shape = np.array([len(insts), src_max_len, d_model], dtype="int32") + trg_data_shape = np.array([len(insts), trg_max_len, d_model], dtype="int32") + src_slf_attn_pre_softmax_shape = np.array( + [-1, src_slf_attn_bias.shape[-1]], dtype="int32") + src_slf_attn_post_softmax_shape = np.array( + src_slf_attn_bias.shape, dtype="int32") + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") + lbl_word = pad_batch_data([inst[2] for inst in insts], trg_pad_idx, n_head, False, False, False, False) lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) - src_data_shape = np.array([len(insts), src_max_len, d_model], dtype="int32") - trg_data_shape = np.array([len(insts), trg_max_len, d_model], dtype="int32") + input_dict = dict( zip(input_data_names, [ - src_word, src_pos, src_slf_attn_bias, src_data_shape, trg_word, - trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, - lbl_word, lbl_weight + src_word, src_pos, src_slf_attn_bias, src_data_shape, + src_slf_attn_pre_softmax_shape, src_slf_attn_post_softmax_shape, + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, + trg_data_shape, trg_slf_attn_pre_softmax_shape, + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, + trg_src_attn_post_softmax_shape, lbl_word, lbl_weight ])) return input_dict @@ -122,6 +141,10 @@ def main(): def test(exe): test_costs = [] for batch_id, data in enumerate(val_data()): + if len(data) != TrainTaskConfig.batch_size: + # Since we use the sum cost, keep comparable cost by fixing the + # batch size. Remove this if the cost is mean. 
+ continue data_input = prepare_batch_input( data, encoder_input_data_names + decoder_input_data_names[:-1] + label_data_names, ModelHyperParams.src_pad_idx, diff --git a/fluid/object_detection/image_util.py b/fluid/object_detection/image_util.py index ba8744eda0a078acd38cad9b10ca7511185efc43..e538449aa9f9ce15e7730de293a98fe753403a87 100644 --- a/fluid/object_detection/image_util.py +++ b/fluid/object_detection/image_util.py @@ -1,4 +1,4 @@ -from PIL import Image +from PIL import Image, ImageEnhance import numpy as np import random import math @@ -159,3 +159,77 @@ def crop_image(img, bbox_labels, sample_bbox, image_width, image_height): sample_img = img[ymin:ymax, xmin:xmax] sample_labels = transform_labels(bbox_labels, sample_bbox) return sample_img, sample_labels + + +def random_brightness(img, settings): + prob = random.uniform(0, 1) + if prob < settings._brightness_prob: + delta = random.uniform(-settings._brightness_delta, + settings._brightness_delta) + 1 + img = ImageEnhance.Brightness(img).enhance(delta) + return img + + +def random_contrast(img, settings): + prob = random.uniform(0, 1) + if prob < settings._contrast_prob: + delta = random.uniform(-settings._contrast_delta, + settings._contrast_delta) + 1 + img = ImageEnhance.Contrast(img).enhance(delta) + return img + + +def random_saturation(img, settings): + prob = random.uniform(0, 1) + if prob < settings._saturation_prob: + delta = random.uniform(-settings._saturation_delta, + settings._saturation_delta) + 1 + img = ImageEnhance.Color(img).enhance(delta) + return img + + +def random_hue(img, settings): + prob = random.uniform(0, 1) + if prob < settings._hue_prob: + delta = random.uniform(-settings._hue_delta, settings._hue_delta) + img_hsv = np.array(img.convert('HSV')) + img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta + img = Image.fromarray(img_hsv, mode='HSV').convert('RGB') + return img + + +def distort_image(img, settings): + prob = random.uniform(0, 1) + # Apply different distort order + if prob > 0.5: + img = random_brightness(img, settings) + img = random_contrast(img, settings) + img = random_saturation(img, settings) + img = random_hue(img, settings) + else: + img = random_brightness(img, settings) + img = random_saturation(img, settings) + img = random_hue(img, settings) + img = random_contrast(img, settings) + return img + + +def expand_image(img, bbox_labels, img_width, img_height, settings): + prob = random.uniform(0, 1) + if prob < settings._expand_prob: + expand_ratio = random.uniform(1, settings._expand_max_ratio) + if expand_ratio - 1 >= 0.01: + height = int(img_height * expand_ratio) + width = int(img_width * expand_ratio) + h_off = math.floor(random.uniform(0, height - img_height)) + w_off = math.floor(random.uniform(0, width - img_width)) + expand_bbox = bbox(-w_off / img_width, -h_off / img_height, + (width - w_off) / img_width, + (height - h_off) / img_height) + expand_img = np.ones((height, width, 3)) + expand_img = np.uint8(expand_img * np.squeeze(settings._img_mean)) + expand_img = Image.fromarray(expand_img) + expand_img.paste(img, (int(w_off), int(h_off))) + bbox_labels = transform_labels(bbox_labels, expand_bbox) + return expand_img, bbox_labels + return img, bbox_labels diff --git a/fluid/object_detection/reader.py b/fluid/object_detection/reader.py index 4e680c29997b432c14b92ea641aa9f956de41292..6a6beb6e50f5b0a7f6b969ca53868178db2527a6 100644 --- a/fluid/object_detection/reader.py +++ b/fluid/object_detection/reader.py @@ -22,17 +22,38 @@ import os class Settings(object): - def __init__(self, 
+    def __init__(self, data_dir, label_file, resize_h, resize_w, mean_value,
+                 apply_distort, apply_expand):
         self._data_dir = data_dir
         self._label_list = []
         label_fpath = os.path.join(data_dir, label_file)
         for line in open(label_fpath):
             self._label_list.append(line.strip())
+        self._apply_distort = apply_distort
+        self._apply_expand = apply_expand
         self._resize_height = resize_h
         self._resize_width = resize_w
         self._img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype(
             'float32')
+        self._expand_prob = 0.5
+        self._expand_max_ratio = 4
+        self._hue_prob = 0.5
+        self._hue_delta = 18
+        self._contrast_prob = 0.5
+        self._contrast_delta = 0.5
+        self._saturation_prob = 0.5
+        self._saturation_delta = 0.5
+        self._brightness_prob = 0.5
+        self._brightness_delta = 0.125
+
+    @property
+    def apply_expand(self):
+        return self._apply_expand
+
+    @property
+    def apply_distort(self):
+        return self._apply_distort

     @property
     def data_dir(self):
@@ -71,7 +92,6 @@ def _reader_creator(settings, file_list, mode, shuffle):

             img = Image.open(img_path)
             img_width, img_height = img.size
-            img = np.array(img)

             # layout: label | xmin | ymin | xmax | ymax | difficult
             if mode == 'train' or mode == 'test':
@@ -99,6 +119,12 @@ def _reader_creator(settings, file_list, mode, shuffle):
                 sample_labels = bbox_labels
                 if mode == 'train':
+                    if settings._apply_distort:
+                        img = image_util.distort_image(img, settings)
+                    if settings._apply_expand:
+                        img, bbox_labels = image_util.expand_image(
+                            img, bbox_labels, img_width, img_height,
+                            settings)
                     batch_sampler = []
                     # hard-code here
                     batch_sampler.append(
@@ -126,13 +152,14 @@ def _reader_creator(settings, file_list, mode, shuffle):
                     sampled_bbox = image_util.generate_batch_samples(
                         batch_sampler, bbox_labels, img_width, img_height)

+                    img = np.array(img)
                     if len(sampled_bbox) > 0:
                         idx = int(random.uniform(0, len(sampled_bbox)))
                         img, sample_labels = image_util.crop_image(
                             img, bbox_labels, sampled_bbox[idx], img_width,
                             img_height)
-                        img = Image.fromarray(img)

+                img = Image.fromarray(img)
                 img = img.resize((settings.resize_w, settings.resize_h),
                                  Image.ANTIALIAS)
                 img = np.array(img)
diff --git a/fluid/object_detection/train.py b/fluid/object_detection/train.py
index ed826a8eebaf9487c4b7e7e28427beaf7760e101..a6c8e9e273cb2e08fc789acfdf9f92cb4e70f341 100644
--- a/fluid/object_detection/train.py
+++ b/fluid/object_detection/train.py
@@ -12,8 +12,9 @@ import functools
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('parallel', bool, True, "Whether use parallel training.")
-add_arg('use_gpu', bool, True, "Whether use GPU.")
+add_arg('batch_size', int, 32, "Minibatch size.")
+add_arg('parallel', bool, True, "Whether to use parallel training.")
+add_arg('use_gpu', bool, True, "Whether to use GPU.")
 # yapf: disable
@@ -47,26 +48,24 @@ def train(args,
             locs, confs, box, box_var = mobile_net(image_, image_shape)
             loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_,
                                          box, box_var)
+            nmsed_out = fluid.layers.detection_output(
+                locs, confs, box, box_var, nms_threshold=0.45)
+            loss = fluid.layers.reduce_sum(loss)
             pd.write_output(loss)
-            pd.write_output(locs)
-            pd.write_output(confs)
-            pd.write_output(box)
-            pd.write_output(box_var)
+            pd.write_output(nmsed_out)

-        loss, locs, confs, box, box_var = pd()
-        loss = fluid.layers.reduce_sum(loss)
+        loss, nmsed_out = pd()
+        loss = fluid.layers.mean(loss)
     else:
         locs, confs, box, box_var = mobile_net(image, image_shape)
        nmsed_out = fluid.layers.detection_output(
-            locs, mbox_confs, box, box_var, nms_threshold=0.45)
-        loss = fluid.layers.ssd_loss(locs, mbox_confs, gt_box, gt_label,
+            locs, confs, box, box_var, nms_threshold=0.45)
+        loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label,
                                      box, box_var)
         loss = fluid.layers.reduce_sum(loss)

     test_program = fluid.default_main_program().clone(for_test=True)
     with fluid.program_guard(test_program):
-        nmsed_out = fluid.layers.detection_output(
-            locs, confs, box, box_var, nms_threshold=0.45)
         map_eval = fluid.evaluator.DetectionMAP(
             nmsed_out,
             gt_label,
@@ -77,13 +76,10 @@ def train(args,
             evaluate_difficult=False,
             ap_version='11point')

-    optimizer = fluid.optimizer.Momentum(
-        learning_rate=fluid.layers.exponential_decay(
-            learning_rate=learning_rate,
-            decay_steps=40000,
-            decay_rate=0.1,
-            staircase=True),
-        momentum=0.9,
+    boundaries = [40000, 60000]
+    values = [0.001, 0.0005, 0.00025]
+    optimizer = fluid.optimizer.RMSProp(
+        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
         regularization=fluid.regularizer.L2Decay(0.00005), )
     optimizer.minimize(loss)

@@ -92,7 +88,8 @@ def train(args,
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())

-    load_model.load_paddlev1_vars(place)
+    load_model.load_and_set_vars(place)
+    #load_model.load_paddlev1_vars(place)
     train_reader = paddle.batch(
         reader.train(data_args, train_file_list), batch_size=batch_size)
     test_reader = paddle.batch(
@@ -100,7 +97,6 @@ def train(args,
     feeder = fluid.DataFeeder(
         place=place, feed_list=[image, gt_box, gt_label, difficult])

-    #print 'test_program ', test_program
     def test(pass_id):
         _, accum_map = map_eval.get_map_var()
         map_eval.reset(exe)
@@ -111,14 +107,14 @@ def train(args,
                 fetch_list=[accum_map])
         print("Test {0}, map {1}".format(pass_id, test_map[0]))

-    #print 'main_program ', fluid.default_main_program()
     for pass_id in range(num_passes):
         for batch_id, data in enumerate(train_reader()):
             loss_v = exe.run(fluid.default_main_program(),
                              feed=feeder.feed(data),
                              fetch_list=[loss])
-            print("Pass {0}, batch {1}, loss {2}"
-                  .format(pass_id, batch_id, loss_v[0]))
+            if batch_id % 20 == 0:
+                print("Pass {0}, batch {1}, loss {2}"
+                      .format(pass_id, batch_id, loss_v[0]))
         test(pass_id)

         if pass_id % 10 == 0:
@@ -134,6 +130,8 @@ if __name__ == '__main__':
     data_args = reader.Settings(
         data_dir='./data',
         label_file='label_list',
+        apply_distort=True,
+        apply_expand=True,
         resize_h=300,
         resize_w=300,
         mean_value=[127.5, 127.5, 127.5])
@@ -142,5 +140,5 @@ if __name__ == '__main__':
         val_file_list='./data/test.txt',
         data_args=data_args,
         learning_rate=0.001,
-        batch_size=32,
+        batch_size=args.batch_size,
         num_passes=300)
diff --git a/fluid/ocr_recognition/crnn_ctc_model.py b/fluid/ocr_recognition/crnn_ctc_model.py
index 73616ecb36ca2661eb8e4898caf34fc2d91b9bdc..dd5aaa3f94c1e2668ec75d30735640d14ee8ef0e 100644
--- a/fluid/ocr_recognition/crnn_ctc_model.py
+++ b/fluid/ocr_recognition/crnn_ctc_model.py
@@ -187,15 +187,18 @@ def ctc_train_net(images, label, args, num_classes):
     error_evaluator = fluid.evaluator.EditDistance(
         input=decoded_out, label=casted_label)

-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(error_evaluator)
+    inference_program = fluid.default_main_program().clone(for_test=True)

     optimizer = fluid.optimizer.Momentum(
         learning_rate=args.learning_rate, momentum=args.momentum)
     _, params_grads = optimizer.minimize(sum_cost)
+    model_average = fluid.optimizer.ModelAverage(
+        params_grads,
+        args.average_window,
+        min_average_window=args.min_average_window,
+        max_average_window=args.max_average_window)

-    return sum_cost, error_evaluator, inference_program
+    return sum_cost, error_evaluator, inference_program, model_average


 def ctc_infer(images, num_classes):
diff --git a/fluid/ocr_recognition/ctc_reader.py b/fluid/ocr_recognition/ctc_reader.py
index f095c9b3bb7cdf36c247cca1c93ea7d636b91d24..c9f75a0d523a7390b3814706cdad831d5900dbdb 100644
--- a/fluid/ocr_recognition/ctc_reader.py
+++ b/fluid/ocr_recognition/ctc_reader.py
@@ -1,14 +1,25 @@
 import os
 import cv2
+import tarfile
 import numpy as np
 from PIL import Image
-
+from os import path
 from paddle.v2.image import load_image
 import paddle.v2 as paddle

 NUM_CLASSES = 10784
 DATA_SHAPE = [1, 48, 512]
+DATA_MD5 = "1de60d54d19632022144e4e58c2637b5"
+DATA_URL = "http://cloud.dlnel.org/filepub/?uuid=df937251-3c0b-480d-9a7b-0080dfeee65c"
+CACHE_DIR_NAME = "ctc_data"
+SAVED_FILE_NAME = "data.tar.gz"
+DATA_DIR_NAME = "data"
+TRAIN_DATA_DIR_NAME = "train_images"
+TEST_DATA_DIR_NAME = "test_images"
+TRAIN_LIST_FILE_NAME = "train.list"
+TEST_LIST_FILE_NAME = "test.list"
+

 class DataGenerator(object):
     def __init__(self):
@@ -102,25 +113,42 @@ class DataGenerator(object):


 def num_classes():
+    '''Get the number of classes in this dataset.
+    '''
     return NUM_CLASSES


 def data_shape():
+    '''Get the image shape of this dataset (a fixed dummy shape).
+    '''
     return DATA_SHAPE


 def train(batch_size):
     generator = DataGenerator()
+    data_dir = download_data()
     return generator.train_reader(
-        "/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/train_images/",
-        "/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/train.list",
-        batch_size)
+        path.join(data_dir, TRAIN_DATA_DIR_NAME),
+        path.join(data_dir, TRAIN_LIST_FILE_NAME), batch_size)


 def test(batch_size=1):
     generator = DataGenerator()
+    data_dir = download_data()
     return paddle.batch(
         generator.test_reader(
-            "/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/test_images/",
-            "/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/test.list"
-        ), batch_size)
+            path.join(data_dir, TEST_DATA_DIR_NAME),
+            path.join(data_dir, TEST_LIST_FILE_NAME)), batch_size)
+
+
+def download_data():
+    '''Download train and test data.
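+
+    The tarball is fetched with paddle.dataset.common.download (verified
+    against DATA_MD5) and extracted next to the cached file on first use;
+    later calls reuse the extracted directory.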
+    '''
+    tar_file = paddle.dataset.common.download(
+        DATA_URL, CACHE_DIR_NAME, DATA_MD5, save_name=SAVED_FILE_NAME)
+    data_dir = path.join(path.dirname(tar_file), DATA_DIR_NAME)
+    if not path.isdir(data_dir):
+        t = tarfile.open(tar_file, "r:gz")
+        t.extractall(path=path.dirname(tar_file))
+        t.close()
+    return data_dir
diff --git a/fluid/ocr_recognition/ctc_train.py b/fluid/ocr_recognition/ctc_train.py
index c2d8fd26bbdeb3ad5c9fb2c1ade3b2b22a0dfd44..2ac23f609779c5e653919fece6dbf661c79e859f 100644
--- a/fluid/ocr_recognition/ctc_train.py
+++ b/fluid/ocr_recognition/ctc_train.py
@@ -8,6 +8,7 @@ import functools
 import sys
 from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
 from crnn_ctc_model import ctc_train_net
+import time

 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
@@ -23,7 +24,10 @@ add_arg('momentum', float, 0.9, "Momentum.")
 add_arg('rnn_hidden_size',int, 200, "Hidden size of rnn layers.")
 add_arg('device', int, 0, "Device id.'-1' means running on CPU"
         "while '0' means GPU-0.")
-add_arg('parallel', bool, True, "Whether use parallel training.")
+add_arg('min_average_window', int, 10000, "Min average window.")
+add_arg('max_average_window', int, 15625, "Max average window.")
+add_arg('average_window', float, 0.15, "Average window.")
+add_arg('parallel', bool, False, "Whether to use parallel training.")
 # yapf: disable

 def load_parameter(place):
@@ -40,7 +44,7 @@ def train(args, data_reader=dummy_reader):
     # define network
     images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int32', lod_level=1)
-    sum_cost, error_evaluator, inference_program = ctc_train_net(images, label, args, num_classes)
+    sum_cost, error_evaluator, inference_program, model_average = ctc_train_net(images, label, args, num_classes)

     # data reader
     train_reader = data_reader.train(args.batch_size)
@@ -66,21 +70,24 @@ def train(args, data_reader=dummy_reader):
                 fetch_list=[sum_cost] + error_evaluator.metrics)
             total_loss += batch_loss[0]
             total_seq_error += batch_seq_error[0]
-            if batch_id % 10 == 1:
+            if batch_id % 100 == 1:
                 print '.',
                 sys.stdout.flush()
             if batch_id % args.log_period == 1:
-                print "\nPass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq error: %s." % (
+                print "\nTime: %s; Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq error: %s." % (
+                    time.time(),
                     pass_id, batch_id, total_loss / (batch_id * args.batch_size),
                     total_seq_error / (batch_id * args.batch_size))
                 sys.stdout.flush()
             batch_id += 1

-        error_evaluator.reset(exe)
-        for data in test_reader():
-            exe.run(inference_program, feed=get_feeder_data(data, place))
-        _, test_seq_error = error_evaluator.eval(exe)
-        print "\nEnd pass[%d]; Test seq error: %s.\n" % (
-            pass_id, str(test_seq_error[0]))
+        with model_average.apply(exe):
+            error_evaluator.reset(exe)
+            for data in test_reader():
+                exe.run(inference_program, feed=get_feeder_data(data, place))
+            _, test_seq_error = error_evaluator.eval(exe)
+
+            print "\nEnd pass[%d]; Test seq error: %s.\n" % (
+                pass_id, str(test_seq_error[0]))


 def main():
     args = parser.parse_args()
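For reference, the step schedule that fluid/object_detection/train.py configures above via fluid.layers.piecewise_decay(boundaries=[40000, 60000], values=[0.001, 0.0005, 0.00025]) behaves like the following minimal pure-Python sketch. The helper name piecewise_lr is illustrative only (not part of the diff), and the exact value chosen at a boundary step may differ by one step from the fluid implementation:

def piecewise_lr(step, boundaries=(40000, 60000),
                 values=(0.001, 0.0005, 0.00025)):
    # Return the learning rate in effect at a global training step:
    # values[i] applies until boundaries[i] is reached; the last value
    # applies from the final boundary onward.
    for boundary, value in zip(boundaries, values):
        if step < boundary:
            return value
    return values[-1]

assert piecewise_lr(0) == 0.001        # steps [0, 40000)
assert piecewise_lr(50000) == 0.0005   # steps [40000, 60000)
assert piecewise_lr(90000) == 0.00025  # steps >= 60000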