diff --git a/.gitignore b/.gitignore index dde3895fc112ad34a839b2fed9210ac2288a959b..9492cff0cb9500079955856eedac883e39b522a8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .DS_Store *.pyc +.*~ diff --git a/.travis.yml b/.travis.yml index 7fa098a7c5d8ecdc5d5ea38fe38c3ddae959498a..ecc348e1482fac430f9d98990b8940ab57b2b75b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,7 @@ addons: - python-pip - python2.7-dev - clang-format-3.8 - ssh_known_hosts: 52.76.173.135 + ssh_known_hosts: 13.229.163.131 before_install: - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - sudo pip install -U virtualenv pre-commit pip diff --git a/fluid/DeepASR/data_utils/async_data_reader.py b/fluid/DeepASR/data_utils/async_data_reader.py index 03448fadccfbcfb67ab28cdf2071fc4b743ef6e5..731c55de71e8d4b7db156f1ae72172c36eb1be7a 100644 --- a/fluid/DeepASR/data_utils/async_data_reader.py +++ b/fluid/DeepASR/data_utils/async_data_reader.py @@ -15,9 +15,7 @@ from multiprocessing import Manager, Process import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm import data_utils.augmentor.trans_add_delta as trans_add_delta from data_utils.util import suppress_complaints, suppress_signal -from data_utils.util import SharedNDArray, SharedMemoryPoolManager -from data_utils.util import DaemonProcessGroup, batch_to_ndarray -from data_utils.util import CriticalException, ForceExitWrapper, EpochEndSignal +from data_utils.util import CriticalException, ForceExitWrapper class SampleInfo(object): @@ -32,11 +30,12 @@ class SampleInfo(object): label_bin_path (str): File containing the label data. label_size (int): Byte count of the sample's label data. label_frame_num (int): Label number of the sample. + sample_name (str): Key of the sample """ def __init__(self, feature_bin_path, feature_start, feature_size, feature_frame_num, feature_dim, label_bin_path, label_start, - label_size, label_frame_num): + label_size, label_frame_num, sample_name): self.feature_bin_path = feature_bin_path self.feature_start = feature_start self.feature_size = feature_size @@ -47,6 +46,7 @@ class SampleInfo(object): self.label_start = label_start self.label_size = label_size self.label_frame_num = label_frame_num + self.sample_name = sample_name class SampleInfoBucket(object): @@ -69,8 +69,8 @@ class SampleInfoBucket(object): split_sentence_threshold(int): Sentence whose length larger than the value will trigger split operation. split_sub_sentence_len(int): sub-sentence length is equal to - (split_sub_sentence_len + \ - rand() % split_perturb). + (split_sub_sentence_len + + rand() % split_perturb). 
""" def __init__(self, @@ -104,24 +104,33 @@ class SampleInfoBucket(object): feature_bin_path = self._feature_bin_paths[block_idx] feature_desc_path = self._feature_desc_paths[block_idx] - label_desc_lines = open(label_desc_path).readlines() feature_desc_lines = open(feature_desc_path).readlines() - sample_num = int(label_desc_lines[0].split()[1]) - assert sample_num == int(feature_desc_lines[0].split()[1]) + label_desc_lines = [] + if label_desc_path != "": + label_desc_lines = open(label_desc_path).readlines() + sample_num = int(feature_desc_lines[0].split()[1]) + + if label_desc_path != "": + assert sample_num == int(label_desc_lines[0].split()[1]) for i in xrange(sample_num): feature_desc_split = feature_desc_lines[i + 1].split() + sample_name = feature_desc_split[0] feature_start = int(feature_desc_split[2]) feature_size = int(feature_desc_split[3]) feature_frame_num = int(feature_desc_split[4]) feature_dim = int(feature_desc_split[5]) - label_desc_split = label_desc_lines[i + 1].split() - label_start = int(label_desc_split[2]) - label_size = int(label_desc_split[3]) - label_frame_num = int(label_desc_split[4]) - assert feature_frame_num == label_frame_num + label_start = -1 + label_size = -1 + label_frame_num = feature_frame_num + if label_desc_path != "": + label_desc_split = label_desc_lines[i + 1].split() + label_start = int(label_desc_split[2]) + label_size = int(label_desc_split[3]) + label_frame_num = int(label_desc_split[4]) + assert feature_frame_num == label_frame_num if self._split_sentence_threshold == -1 or \ self._split_perturb == -1 or \ @@ -131,7 +140,7 @@ class SampleInfoBucket(object): SampleInfo(feature_bin_path, feature_start, feature_size, feature_frame_num, feature_dim, label_bin_path, label_start, label_size, - label_frame_num)) + label_frame_num, sample_name)) #split sentence else: cur_frame_pos = 0 @@ -152,16 +161,19 @@ class SampleInfoBucket(object): * feature_dim * 4, cur_frame_len * feature_dim * 4, cur_frame_len, feature_dim, label_bin_path, label_start + cur_frame_pos * 4, cur_frame_len * - 4, cur_frame_len)) + 4, cur_frame_len, sample_name)) remain_frame_num -= cur_frame_len cur_frame_pos += cur_frame_len if remain_frame_num <= 0: break - return sample_info_list +class EpochEndSignal(): + pass + + class AsyncDataReader(object): """DataReader provides basic audio sample preprocessing pipeline including data loading and data augmentation. 
@@ -190,7 +202,7 @@ class AsyncDataReader(object): def __init__(self, feature_file_list, - label_file_list, + label_file_list="", drop_frame_len=512, proc_num=10, sample_buffer_size=1024, @@ -213,25 +225,30 @@ class AsyncDataReader(object): self._sample_info_buffer_size = sample_info_buffer_size self._batch_buffer_size = batch_buffer_size self._proc_num = proc_num - if self._proc_num <= 2: - raise ValueError("Value of `proc_num` should be greater than 2.") - self._sample_proc_num = self._proc_num - 2 self._verbose = verbose self._force_exit = ForceExitWrapper(self._manager.Value('b', False)) def generate_bucket_list(self, is_shuffle): if self._block_info_list is None: block_feature_info_lines = open(self._feature_file_list).readlines() - block_label_info_lines = open(self._label_file_list).readlines() - assert len(block_feature_info_lines) == len(block_label_info_lines) self._block_info_list = [] - for i in xrange(0, len(block_feature_info_lines), 2): - block_info = (block_feature_info_lines[i], - block_feature_info_lines[i + 1], - block_label_info_lines[i], - block_label_info_lines[i + 1]) - self._block_info_list.append( - map(lambda line: line.strip(), block_info)) + if self._label_file_list != "": + block_label_info_lines = open(self._label_file_list).readlines() + assert len(block_feature_info_lines) == len( + block_label_info_lines) + for i in xrange(0, len(block_feature_info_lines), 2): + block_info = (block_feature_info_lines[i], + block_feature_info_lines[i + 1], + block_label_info_lines[i], + block_label_info_lines[i + 1]) + self._block_info_list.append( + map(lambda line: line.strip(), block_info)) + else: + for i in xrange(0, len(block_feature_info_lines), 2): + block_info = (block_feature_info_lines[i], + block_feature_info_lines[i + 1], "", "") + self._block_info_list.append( + map(lambda line: line.strip(), block_info)) if is_shuffle: self._rng.shuffle(self._block_info_list) @@ -251,23 +268,13 @@ class AsyncDataReader(object): def set_transformers(self, transformers): self._transformers = transformers - def recycle(self, *args): - for shared_ndarray in args: - if not isinstance(shared_ndarray, SharedNDArray): - raise Value("Only support recycle SharedNDArray object.") - shared_ndarray.recycle(self._pool_manager.pool) - - def _start_async_processing(self): + def _sample_generator(self): sample_info_queue = self._manager.Queue(self._sample_info_buffer_size) sample_queue = self._manager.Queue(self._sample_buffer_size) self._order_id = 0 @suppress_complaints(verbose=self._verbose, notify=self._force_exit) def ordered_feeding_task(sample_info_queue): - if self._verbose == 0: - signal.signal(signal.SIGTERM, suppress_signal) - signal.signal(signal.SIGINT, suppress_signal) - for sample_info_bucket in self._bucket_list: try: sample_info_list = \ @@ -280,12 +287,13 @@ class AsyncDataReader(object): sample_info_queue.put((sample_info, self._order_id)) self._order_id += 1 - for i in xrange(self._sample_proc_num): + for i in xrange(self._proc_num): sample_info_queue.put(EpochEndSignal()) - feeding_proc = DaemonProcessGroup( - proc_num=1, target=ordered_feeding_task, args=(sample_info_queue, )) - feeding_proc.start_all() + feeding_thread = Thread( + target=ordered_feeding_task, args=(sample_info_queue, )) + feeding_thread.daemon = True + feeding_thread.start() @suppress_complaints(verbose=self._verbose, notify=self._force_exit) def ordered_processing_task(sample_info_queue, sample_queue, out_order): @@ -313,25 +321,32 @@ class AsyncDataReader(object): sample_info.feature_size) assert 
sample_info.feature_frame_num \ - * sample_info.feature_dim * 4 == len(feature_bytes), \ - (sample_info.feature_bin_path, - sample_info.feature_frame_num, - sample_info.feature_dim, - len(feature_bytes)) - - label_bytes = read_bytes(sample_info.label_bin_path, - sample_info.label_start, - sample_info.label_size) - - assert sample_info.label_frame_num * 4 == len(label_bytes), ( - sample_info.label_bin_path, sample_info.label_array, - len(label_bytes)) - - label_array = struct.unpack('I' * sample_info.label_frame_num, - label_bytes) - label_data = np.array( - label_array, dtype='int64').reshape( - (sample_info.label_frame_num, 1)) + * sample_info.feature_dim * 4 \ + == len(feature_bytes), \ + (sample_info.feature_bin_path, + sample_info.feature_frame_num, + sample_info.feature_dim, + len(feature_bytes)) + + label_data = None + if sample_info.label_bin_path != "": + label_bytes = read_bytes(sample_info.label_bin_path, + sample_info.label_start, + sample_info.label_size) + + assert sample_info.label_frame_num * 4 == len( + label_bytes), (sample_info.label_bin_path, + sample_info.label_array, + len(label_bytes)) + + label_array = struct.unpack( + 'I' * sample_info.label_frame_num, label_bytes) + label_data = np.array( + label_array, dtype='int64').reshape( + (sample_info.label_frame_num, 1)) + else: + label_data = np.zeros( + (sample_info.label_frame_num, 1), dtype='int64') feature_frame_num = sample_info.feature_frame_num feature_dim = sample_info.feature_dim @@ -341,12 +356,11 @@ class AsyncDataReader(object): feature_data = np.array( feature_array, dtype='float32').reshape(( sample_info.feature_frame_num, sample_info.feature_dim)) - - sample_data = (feature_data, label_data) + sample_data = (feature_data, label_data, + sample_info.sample_name) for transformer in self._transformers: # @TODO(pkuyym) to make transfomer only accept feature_data sample_data = transformer.perform_trans(sample_data) - while order_id != out_order[0]: time.sleep(0.001) @@ -362,74 +376,77 @@ class AsyncDataReader(object): out_order = self._manager.list([0]) args = (sample_info_queue, sample_queue, out_order) - sample_proc = DaemonProcessGroup( - proc_num=self._sample_proc_num, - target=ordered_processing_task, - args=args) - sample_proc.start_all() + workers = [ + Process( + target=ordered_processing_task, args=args) + for _ in xrange(self._proc_num) + ] - return sample_queue + for w in workers: + w.daemon = True + w.start() - def batch_iterator(self, batch_size, minimum_batch_size): - @suppress_complaints(verbose=self._verbose, notify=self._force_exit) - def batch_assembling_task(sample_queue, batch_queue, pool): - def conv_to_shared(ndarray): - while self._force_exit == False: - try: - (name, shared_ndarray) = pool.popitem() - except Exception as e: - time.sleep(0.001) + finished_proc_num = 0 + + while self._force_exit == False: + try: + sample = sample_queue.get_nowait() + except Queue.Empty: + time.sleep(0.001) + else: + if isinstance(sample, EpochEndSignal): + finished_proc_num += 1 + if finished_proc_num >= self._proc_num: + break else: - shared_ndarray.copy(ndarray) - return shared_ndarray + continue - if self._verbose == 0: - signal.signal(signal.SIGTERM, suppress_signal) - signal.signal(signal.SIGINT, suppress_signal) + yield sample + + def batch_iterator(self, batch_size, minimum_batch_size): + def batch_to_ndarray(batch_samples, lod): + assert len(batch_samples) + frame_dim = batch_samples[0][0].shape[1] + batch_feature = np.zeros((lod[-1], frame_dim), dtype="float32") + batch_label = np.zeros((lod[-1], 1), 
dtype="int64") + start = 0 + name_lst = [] + for sample in batch_samples: + frame_num = sample[0].shape[0] + batch_feature[start:start + frame_num, :] = sample[0] + batch_label[start:start + frame_num, :] = sample[1] + start += frame_num + name_lst.append(sample[2]) + return (batch_feature, batch_label, name_lst) + @suppress_complaints(verbose=self._verbose, notify=self._force_exit) + def batch_assembling_task(sample_generator, batch_queue): batch_samples = [] lod = [0] - done_num = 0 - while done_num < self._sample_proc_num: - sample = sample_queue.get() - if isinstance(sample, EpochEndSignal): - done_num += 1 - else: - batch_samples.append(sample) - lod.append(lod[-1] + sample[0].shape[0]) - if len(batch_samples) == batch_size: - feature, label = batch_to_ndarray(batch_samples, lod) - - feature = conv_to_shared(feature) - label = conv_to_shared(label) - lod = conv_to_shared(np.array(lod).astype('int64')) - - batch_queue.put((feature, label, lod)) - batch_samples = [] - lod = [0] + for sample in sample_generator(): + batch_samples.append(sample) + lod.append(lod[-1] + sample[0].shape[0]) + if len(batch_samples) == batch_size: + (batch_feature, batch_label, name_lst) = batch_to_ndarray( + batch_samples, lod) + batch_queue.put((batch_feature, batch_label, lod, name_lst)) + batch_samples = [] + lod = [0] if len(batch_samples) >= minimum_batch_size: - (feature, label) = batch_to_ndarray(batch_samples, lod) - - feature = conv_to_shared(feature) - label = conv_to_shared(label) - lod = conv_to_shared(np.array(lod).astype('int64')) - - batch_queue.put((feature, label, lod)) + (batch_feature, batch_label, name_lst) = batch_to_ndarray( + batch_samples, lod) + batch_queue.put((batch_feature, batch_label, lod, name_lst)) batch_queue.put(EpochEndSignal()) - sample_queue = self._start_async_processing() - batch_queue = self._manager.Queue(self._batch_buffer_size) + batch_queue = Queue.Queue(self._batch_buffer_size) - self._pool_manager = SharedMemoryPoolManager(self._batch_buffer_size * - 3, self._manager) - - assembling_proc = DaemonProcessGroup( - proc_num=1, + assembling_thread = Thread( target=batch_assembling_task, - args=(sample_queue, batch_queue, self._pool_manager.pool)) - assembling_proc.start_all() + args=(self._sample_generator, batch_queue)) + assembling_thread.daemon = True + assembling_thread.start() while self._force_exit == False: try: @@ -440,6 +457,3 @@ class AsyncDataReader(object): if isinstance(batch_data, EpochEndSignal): break yield batch_data - - # clean the shared memory - del self._pool_manager diff --git a/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py b/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py index 157ab02eee0093fe5d683e642b3d18d842cb4e19..9f76a9f8590d5f148398c4ffaff77dc95421df83 100644 --- a/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py +++ b/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py @@ -22,7 +22,7 @@ class TestTransMeanVarianceNorm(unittest.TestCase): feature = np.zeros((2, 120), dtype="float32") feature.fill(1) trans = trans_mean_variance_norm.TransMeanVarianceNorm(self._file_path) - (feature1, label1) = trans.perform_trans((feature, None)) + (feature1, label1, name) = trans.perform_trans((feature, None, None)) (mean, var) = trans.get_mean_var() feature_flat1 = feature1.flatten() feature_flat = feature.flatten() @@ -70,7 +70,7 @@ class TestTransAddDelta(unittest.TestCase): feature[2, 0:40].fill(3) feature[3, 0:40].fill(4) trans = trans_add_delta.TransAddDelta() - (feature, label) = 
trans.perform_trans((feature, None)) + (feature, label, name) = trans.perform_trans((feature, None, None)) self.assertAlmostEqual(feature.shape[0], 4) self.assertAlmostEqual(feature.shape[1], 120) self.assertAlmostEqual(1.0, feature[0][0]) @@ -93,7 +93,7 @@ class TestTransSplict(unittest.TestCase): feature[i, :].fill(i) trans = trans_splice.TransSplice() - (feature, label) = trans.perform_trans((feature, None)) + (feature, label, name) = trans.perform_trans((feature, None, None)) self.assertEqual(feature.shape[1], 110) for i in xrange(8): diff --git a/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py b/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py index dc1a4fa45be38152eba773c35e67d0ad3e4a13cb..aa8062f87c932b76dd8a79db825d07e8be273857 100644 --- a/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py +++ b/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py @@ -32,9 +32,9 @@ class TransAddDelta(object): Args: sample(object,tuple): contain feature numpy and label numpy Returns: - (feature, label) + (feature, label, name) """ - (feature, label) = sample + (feature, label, name) = sample frame_dim = feature.shape[1] d_frame_dim = frame_dim * 3 head_filled = 5 @@ -64,7 +64,7 @@ class TransAddDelta(object): start * d_frame_dim + 2 * frame_dim, frame_dim, nframe, d_frame_dim) mat.shape = tmp_shape - return (mat[head_filled:mat.shape[0] - tail_filled, :], label) + return (mat[head_filled:mat.shape[0] - tail_filled, :], label, name) def _regress(self, data_in, start_in, data_out, start_out, size, n, step): """ regress diff --git a/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py b/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py index 5b541d426c61364639f7a9d9f50bd51a2c06efa5..9f91b726ea2bcd432340cd06a3cb9006cd5f83f4 100644 --- a/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py +++ b/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py @@ -53,9 +53,9 @@ class TransMeanVarianceNorm(object): Args: sample(object):input sample, contain feature numpy and label numpy Returns: - (feature, label) + (feature, label, name) """ - (feature, label) = sample + (feature, label, name) = sample shape = feature.shape assert len(shape) == 2 nfeature_len = shape[0] * shape[1] @@ -68,4 +68,4 @@ class TransMeanVarianceNorm(object): feature[ncur_idx:ncur_idx + self._nLen] = block ncur_idx += self._nLen feature = feature.reshape(shape) - return (feature, label) + return (feature, label, name) diff --git a/fluid/DeepASR/data_utils/augmentor/trans_splice.py b/fluid/DeepASR/data_utils/augmentor/trans_splice.py index 94f5258de316045d41999b26c6963f8487e9c55a..1fab3d6b442c1613f18d16fd0b0ee89464dbeb2c 100644 --- a/fluid/DeepASR/data_utils/augmentor/trans_splice.py +++ b/fluid/DeepASR/data_utils/augmentor/trans_splice.py @@ -30,9 +30,9 @@ class TransSplice(object): Args: sample(object): input sample(feature, label) Return: - (feature, label) + (feature, label, name) """ - (feature, label) = sample + (feature, label, name) = sample nframe_num = feature.shape[0] nframe_dim = feature.shape[1] nnew_frame_dim = nframe_dim * ( @@ -61,4 +61,4 @@ class TransSplice(object): np.copyto(ret[i * nnew_frame_dim:(i + 1) * nnew_frame_dim], mat[i * nframe_dim:i * nframe_dim + nnew_frame_dim]) ret = ret.reshape((nframe_num, nnew_frame_dim)) - return (ret, label) + return (ret, label, name) diff --git a/fluid/DeepASR/data_utils/util.py b/fluid/DeepASR/data_utils/util.py index 5d519c0ac30cc63c967f25503ca9dff1def59a8e..0a48f4696547377dbe89934355e8eaac38966fab 100644 --- 
a/fluid/DeepASR/data_utils/util.py +++ b/fluid/DeepASR/data_utils/util.py @@ -1,11 +1,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import sys, time +import sys from six import reraise from tblib import Traceback -from multiprocessing import Manager, Process -import posix_ipc, mmap import numpy as np @@ -37,19 +35,6 @@ def lodtensor_to_ndarray(lod_tensor): return ret, lod_tensor.lod() -def batch_to_ndarray(batch_samples, lod): - frame_dim = batch_samples[0][0].shape[1] - batch_feature = np.zeros((lod[-1], frame_dim), dtype="float32") - batch_label = np.zeros((lod[-1], 1), dtype="int64") - start = 0 - for sample in batch_samples: - frame_num = sample[0].shape[0] - batch_feature[start:start + frame_num, :] = sample[0] - batch_label[start:start + frame_num, :] = sample[1] - start += frame_num - return (batch_feature, batch_label) - - def split_infer_result(infer_seq, lod): infer_batch = [] for i in xrange(0, len(lod[0]) - 1): @@ -57,127 +42,10 @@ def split_infer_result(infer_seq, lod): return infer_batch -class DaemonProcessGroup(object): - def __init__(self, proc_num, target, args): - self._proc_num = proc_num - self._workers = [ - Process( - target=target, args=args) for _ in xrange(self._proc_num) - ] - - def start_all(self): - for w in self._workers: - w.daemon = True - w.start() - - @property - def proc_num(self): - return self._proc_num - - -class EpochEndSignal(object): - pass - - class CriticalException(Exception): pass -class SharedNDArray(object): - """SharedNDArray utilizes shared memory to avoid data serialization when - data object shared among different processes. We can reconstruct the - `ndarray` when memory address, shape and dtype provided. - - Args: - name (str): Address name of shared memory. - whether_verify (bool): Whether to validate the writing operation. - """ - - def __init__(self, name, whether_verify=False): - self._name = name - self._shm = None - self._buf = None - self._array = np.zeros(1, dtype=np.float32) - self._inited = False - self._whether_verify = whether_verify - - def zeros_like(self, shape, dtype): - size = int(np.prod(shape)) * np.dtype(dtype).itemsize - if self._inited: - self._shm = posix_ipc.SharedMemory(self._name) - else: - self._shm = posix_ipc.SharedMemory( - self._name, posix_ipc.O_CREAT, size=size) - self._buf = mmap.mmap(self._shm.fd, size) - self._array = np.ndarray(shape, dtype, self._buf, order='C') - - def copy(self, ndarray): - size = int(np.prod(ndarray.shape)) * np.dtype(ndarray.dtype).itemsize - self.zeros_like(ndarray.shape, ndarray.dtype) - self._array[:] = ndarray - self._buf.flush() - self._inited = True - - if self._whether_verify: - shm = posix_ipc.SharedMemory(self._name) - buf = mmap.mmap(shm.fd, size) - array = np.ndarray(ndarray.shape, ndarray.dtype, buf, order='C') - np.testing.assert_array_equal(array, ndarray) - - @property - def ndarray(self): - return self._array - - def recycle(self, pool): - self._buf.close() - self._shm.close_fd() - self._inited = False - pool[self._name] = self - - def __getstate__(self): - return (self._name, self._array.shape, self._array.dtype, self._inited, - self._whether_verify) - - def __setstate__(self, state): - self._name = state[0] - self._inited = state[3] - self.zeros_like(state[1], state[2]) - self._whether_verify = state[4] - - -class SharedMemoryPoolManager(object): - """SharedMemoryPoolManager maintains a multiprocessing.Manager.dict object. - All available addresses are allocated once and will be reused. 
Though this - class is not process-safe, the pool can be shared between processes. All - shared memory should be unlinked before the main process exited. - - Args: - pool_size (int): Size of shared memory pool. - manager (dict): A multiprocessing.Manager object, the pool is - maintained by the proxy process. - name_prefix (str): Address prefix of shared memory. - """ - - def __init__(self, pool_size, manager, name_prefix='/deep_asr'): - self._names = [] - self._dict = manager.dict() - self._time_prefix = time.strftime('%Y%m%d%H%M%S') - - for i in xrange(pool_size): - name = name_prefix + '_' + self._time_prefix + '_' + str(i) - self._dict[name] = SharedNDArray(name) - self._names.append(name) - - @property - def pool(self): - return self._dict - - def __del__(self): - for name in self._names: - # have to unlink the shared memory - posix_ipc.unlink_shared_memory(name) - - def suppress_signal(signo, stack_frame): pass diff --git a/fluid/DeepASR/decoder/post_decode_faster.cc b/fluid/DeepASR/decoder/post_decode_faster.cc index d7f1d1ab34a18285d1d96b9ff6a67cff42d519b3..ce2b45bc6cecec5466f3d20841e5b8ba38151a6c 100644 --- a/fluid/DeepASR/decoder/post_decode_faster.cc +++ b/fluid/DeepASR/decoder/post_decode_faster.cc @@ -21,14 +21,15 @@ using fst::StdArc; Decoder::Decoder(std::string word_syms_filename, std::string fst_in_filename, - std::string logprior_rxfilename) { + std::string logprior_rxfilename, + kaldi::BaseFloat acoustic_scale) { const char* usage = "Decode, reading log-likelihoods (of transition-ids or whatever symbol " "is on the graph) as matrices."; kaldi::ParseOptions po(usage); binary = true; - acoustic_scale = 1.5; + this->acoustic_scale = acoustic_scale; allow_partial = true; kaldi::FasterDecoderOptions decoder_opts; decoder_opts.Register(&po, true); // true == include obscure settings. diff --git a/fluid/DeepASR/decoder/post_decode_faster.h b/fluid/DeepASR/decoder/post_decode_faster.h index 2e31a1c19e40bd879a1c76f1542b94eaa853be12..8bade8d6988f02ef4caab8ecf6fc50209aa3642a 100644 --- a/fluid/DeepASR/decoder/post_decode_faster.h +++ b/fluid/DeepASR/decoder/post_decode_faster.h @@ -29,7 +29,8 @@ class Decoder { public: Decoder(std::string word_syms_filename, std::string fst_in_filename, - std::string logprior_rxfilename); + std::string logprior_rxfilename, + kaldi::BaseFloat acoustic_scale); ~Decoder(); // Interface to accept the scores read from specifier and return diff --git a/fluid/DeepASR/decoder/pybind.cc b/fluid/DeepASR/decoder/pybind.cc index 56439d180263b4d753eccd82826d1b39c9d2fa85..90ea38ffb535677dc66d74fc64ff3fe4a27bf824 100644 --- a/fluid/DeepASR/decoder/pybind.cc +++ b/fluid/DeepASR/decoder/pybind.cc @@ -23,7 +23,7 @@ PYBIND11_MODULE(post_decode_faster, m) { m.doc() = "Decoder for Deep ASR model"; py::class_(m, "Decoder") - .def(py::init()) + .def(py::init()) .def("decode", (std::vector (Decoder::*)(std::string)) & Decoder::decode, diff --git a/fluid/DeepASR/examples/aishell/prepare_data.sh b/fluid/DeepASR/examples/aishell/prepare_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..d2c051c4d9ea10547f5ba4cc20213f430bf6dfce --- /dev/null +++ b/fluid/DeepASR/examples/aishell/prepare_data.sh @@ -0,0 +1,37 @@ +data_dir=~/.cache/paddle/dataset/speech/deep_asr_data/aishell +data_url='http://deep-asr-data.gz.bcebos.com/aishell_data.tar.gz' +lst_url='http://deep-asr-data.gz.bcebos.com/aishell_lst.tar.gz' +md5=e017d858d9e509c8a84b73f673f08b9a + +if [ ! -e $data_dir ]; then + mkdir -p $data_dir +fi + +if [ ! 
-e $data_dir/aishell_data.tar.gz ]; then + echo "Download $data_dir/aishell_data.tar.gz ..." + wget -c -P $data_dir $data_url +else + echo "Skip downloading for $data_dir/aishell_data.tar.gz has already existed!" +fi + +echo "Checking md5 sum ..." +md5sum_tmp=`md5sum $data_dir/aishell_data.tar.gz | cut -d ' ' -f1` + +if [ $md5sum_tmp != $md5 ]; then + echo "Md5sum check failed, please remove and redownload " + "$data_dir/aishell_data.tar.gz" + exit 1 +fi + +echo "Untar aishell_data.tar.gz ..." +tar xzf $data_dir/aishell_data.tar.gz -C $data_dir + +if [ ! -e data ]; then + mkdir data +fi + +echo "Download and untar lst files ..." +wget -c -P data $lst_url +tar xvf data/aishell_lst.tar.gz -C data + +ln -s $data_dir data/aishell diff --git a/fluid/DeepASR/examples/aishell/train.sh b/fluid/DeepASR/examples/aishell/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..41c0df2cd4985ae555f70554f27ff0dde8cb0cbe --- /dev/null +++ b/fluid/DeepASR/examples/aishell/train.sh @@ -0,0 +1,13 @@ +export CUDA_VISIBLE_DEVICES=2,3,4,5 +python -u ../../train.py --train_feature_lst data/train_feature.lst \ + --train_label_lst data/train_label.lst \ + --val_feature_lst data/val_feature.lst \ + --val_label_lst data/val_label.lst \ + --mean_var data/aishell/global_mean_var \ + --checkpoints checkpoints \ + --frame_dim 2640 \ + --class_num 101 \ + --infer_models '' \ + --batch_size 128 \ + --learning_rate 0.00016 \ + --parallel diff --git a/fluid/DeepASR/infer.py b/fluid/DeepASR/infer.py index babcb416ea884081ae249a8d1dc177f85cf1c9ba..84269261a95c381a9be21425abf43b98006f0886 100644 --- a/fluid/DeepASR/infer.py +++ b/fluid/DeepASR/infer.py @@ -8,7 +8,7 @@ import paddle.fluid as fluid import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm import data_utils.augmentor.trans_add_delta as trans_add_delta import data_utils.augmentor.trans_splice as trans_splice -import data_utils.data_reader as reader +import data_utils.async_data_reader as reader from data_utils.util import lodtensor_to_ndarray from data_utils.util import split_infer_result @@ -79,12 +79,13 @@ def infer(args): trans_splice.TransSplice() ] - infer_data_reader = reader.DataReader(args.infer_feature_lst, - args.infer_label_lst) + infer_data_reader = reader.AsyncDataReader(args.infer_feature_lst, + args.infer_label_lst) infer_data_reader.set_transformers(ltrans) feature_t = fluid.LoDTensor() one_batch = infer_data_reader.batch_iterator(args.batch_size, 1).next() + (features, labels, lod) = one_batch feature_t.set(features, place) feature_t.set_lod([lod]) diff --git a/fluid/DeepASR/infer_by_ckpt.py b/fluid/DeepASR/infer_by_ckpt.py index f267f674986a87d552bb1a2a277c21c27cca148a..4a4073c02279bfd74b8ce31d0877a5338400d93b 100644 --- a/fluid/DeepASR/infer_by_ckpt.py +++ b/fluid/DeepASR/infer_by_ckpt.py @@ -17,6 +17,7 @@ from decoder.post_decode_faster import Decoder from data_utils.util import lodtensor_to_ndarray from model_utils.model import stacked_lstmp_model from data_utils.util import split_infer_result +from tools.error_rate import char_errors def parse_args(): @@ -86,6 +87,11 @@ def parse_args(): type=str, default='data/infer_label.lst', help='The label list path for inference. (default: %(default)s)') + parser.add_argument( + '--ref_txt', + type=str, + default='data/text.test', + help='The reference text for decoding. 
(default: %(default)s)') parser.add_argument( '--checkpoint', type=str, @@ -106,6 +112,16 @@ def parse_args(): type=str, default="./decoder/logprior", help="The log prior probs for training data. (default: %(default)s)") + parser.add_argument( + '--acoustic_scale', + type=float, + default=0.2, + help="Scaling factor for acoustic likelihoods. (default: %(default)f)") + parser.add_argument( + '--target_trans', + type=str, + default="./decoder/target_trans.txt", + help="The path to target transcription. (default: %(default)s)") args = parser.parse_args() return args @@ -117,6 +133,18 @@ def print_arguments(args): print('------------------------------------------------') +def get_trg_trans(args): + trans_dict = {} + with open(args.target_trans) as trg_trans: + line = trg_trans.readline() + while line: + items = line.strip().split() + key = items[0] + trans_dict[key] = ''.join(items[1:]) + line = trg_trans.readline() + return trans_dict + + def infer_from_ckpt(args): """Inference by using checkpoint.""" @@ -140,9 +168,14 @@ def infer_from_ckpt(args): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) + trg_trans = get_trg_trans(args) # load checkpoint. fluid.io.load_persistables(exe, args.checkpoint) + # init decoder + decoder = Decoder(args.vocabulary, args.graphs, args.log_prior, + args.acoustic_scale) + ltrans = [ trans_add_delta.TransAddDelta(2, 2), trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var), @@ -157,17 +190,16 @@ def infer_from_ckpt(args): args.infer_label_lst) infer_data_reader.set_transformers(ltrans) infer_costs, infer_accs = [], [] + total_edit_dist, total_ref_len = 0.0, 0 for batch_id, batch_data in enumerate( infer_data_reader.batch_iterator(args.batch_size, args.minimum_batch_size)): # load_data - (features, labels, lod) = batch_data - feature_t.set(features.ndarray, place) - feature_t.set_lod([lod.ndarray]) - label_t.set(labels.ndarray, place) - label_t.set_lod([lod.ndarray]) - - infer_data_reader.recycle(features, labels, lod) + (features, labels, lod, name_lst) = batch_data + feature_t.set(features, place) + feature_t.set_lod([lod]) + label_t.set(labels, place) + label_t.set_lod([lod]) results = exe.run(infer_program, feed={"feature": feature_t, @@ -179,11 +211,19 @@ def infer_from_ckpt(args): probs, lod = lodtensor_to_ndarray(results[0]) infer_batch = split_infer_result(probs, lod) - for index, sample in enumerate(infer_batch): - key = "utter#%d" % (batch_id * args.batch_size + index) - print(key, ": ", decoder.decode(key, sample), "\n") - print(np.mean(infer_costs), np.mean(infer_accs)) + for index, sample in enumerate(infer_batch): + key = name_lst[index] + ref = trg_trans[key] + hyp = decoder.decode(key, sample) + edit_dist, ref_len = char_errors(ref.decode("utf8"), hyp) + total_edit_dist += edit_dist + total_ref_len += ref_len + print(key + "|Ref:", ref) + print(key + "|Hyp:", hyp.encode("utf8")) + print("Instance CER: ", edit_dist / ref_len) + + print("Total CER = %f" % (total_edit_dist / total_ref_len)) if __name__ == '__main__': diff --git a/fluid/DeepASR/tools/error_rate.py b/fluid/DeepASR/tools/error_rate.py new file mode 100644 index 0000000000000000000000000000000000000000..215ad39d24a551879d0fd8d4c8892161a0708370 --- /dev/null +++ b/fluid/DeepASR/tools/error_rate.py @@ -0,0 +1,182 @@ +# -*- coding: utf-8 -*- +"""This module provides functions to calculate error rate in different level. +e.g. wer for word-level, cer for char-level. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + + +def _levenshtein_distance(ref, hyp): + """Levenshtein distance is a string metric for measuring the difference + between two sequences. Informally, the levenshtein disctance is defined as + the minimum number of single-character edits (substitutions, insertions or + deletions) required to change one word into the other. We can naturally + extend the edits to word level when calculate levenshtein disctance for + two sentences. + """ + m = len(ref) + n = len(hyp) + + # special case + if ref == hyp: + return 0 + if m == 0: + return n + if n == 0: + return m + + if m < n: + ref, hyp = hyp, ref + m, n = n, m + + # use O(min(m, n)) space + distance = np.zeros((2, n + 1), dtype=np.int32) + + # initialize distance matrix + for j in xrange(n + 1): + distance[0][j] = j + + # calculate levenshtein distance + for i in xrange(1, m + 1): + prev_row_idx = (i - 1) % 2 + cur_row_idx = i % 2 + distance[cur_row_idx][0] = i + for j in xrange(1, n + 1): + if ref[i - 1] == hyp[j - 1]: + distance[cur_row_idx][j] = distance[prev_row_idx][j - 1] + else: + s_num = distance[prev_row_idx][j - 1] + 1 + i_num = distance[cur_row_idx][j - 1] + 1 + d_num = distance[prev_row_idx][j] + 1 + distance[cur_row_idx][j] = min(s_num, i_num, d_num) + + return distance[m % 2][n] + + +def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '): + """Compute the levenshtein distance between reference sequence and + hypothesis sequence in word-level. + :param reference: The reference sentence. + :type reference: basestring + :param hypothesis: The hypothesis sentence. + :type hypothesis: basestring + :param ignore_case: Whether case-sensitive or not. + :type ignore_case: bool + :param delimiter: Delimiter of input sentences. + :type delimiter: char + :return: Levenshtein distance and word number of reference sentence. + :rtype: list + """ + if ignore_case == True: + reference = reference.lower() + hypothesis = hypothesis.lower() + + ref_words = filter(None, reference.split(delimiter)) + hyp_words = filter(None, hypothesis.split(delimiter)) + + edit_distance = _levenshtein_distance(ref_words, hyp_words) + return float(edit_distance), len(ref_words) + + +def char_errors(reference, hypothesis, ignore_case=False, remove_space=False): + """Compute the levenshtein distance between reference sequence and + hypothesis sequence in char-level. + :param reference: The reference sentence. + :type reference: basestring + :param hypothesis: The hypothesis sentence. + :type hypothesis: basestring + :param ignore_case: Whether case-sensitive or not. + :type ignore_case: bool + :param remove_space: Whether remove internal space characters + :type remove_space: bool + :return: Levenshtein distance and length of reference sentence. + :rtype: list + """ + if ignore_case == True: + reference = reference.lower() + hypothesis = hypothesis.lower() + + join_char = ' ' + if remove_space == True: + join_char = '' + + reference = join_char.join(filter(None, reference.split(' '))) + hypothesis = join_char.join(filter(None, hypothesis.split(' '))) + + edit_distance = _levenshtein_distance(reference, hypothesis) + return float(edit_distance), len(reference) + + +def wer(reference, hypothesis, ignore_case=False, delimiter=' '): + """Calculate word error rate (WER). WER compares reference text and + hypothesis text in word-level. WER is defined as: + .. math:: + WER = (Sw + Dw + Iw) / Nw + where + .. 
+        Sw is the number of words substituted,
+        Dw is the number of words deleted,
+        Iw is the number of words inserted,
+        Nw is the number of words in the reference
+    We can use the Levenshtein distance to calculate WER. Note that empty
+    items will be removed when splitting sentences by the delimiter.
+    :param reference: The reference sentence.
+    :type reference: basestring
+    :param hypothesis: The hypothesis sentence.
+    :type hypothesis: basestring
+    :param ignore_case: Whether to ignore case in the comparison.
+    :type ignore_case: bool
+    :param delimiter: Delimiter of input sentences.
+    :type delimiter: char
+    :return: Word error rate.
+    :rtype: float
+    :raises ValueError: If the word number of the reference is zero.
+    """
+    edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case,
+                                         delimiter)
+
+    if ref_len == 0:
+        raise ValueError("Reference's word number should be greater than 0.")
+
+    wer = float(edit_distance) / ref_len
+    return wer
+
+
+def cer(reference, hypothesis, ignore_case=False, remove_space=False):
+    """Calculate character error rate (CER). CER compares reference text and
+    hypothesis text at character level. CER is defined as:
+    .. math::
+        CER = (Sc + Dc + Ic) / Nc
+    where
+    .. code-block:: text
+        Sc is the number of characters substituted,
+        Dc is the number of characters deleted,
+        Ic is the number of characters inserted,
+        Nc is the number of characters in the reference
+    We can use the Levenshtein distance to calculate CER. Chinese input should
+    be encoded to unicode. Note that leading and trailing space characters
+    will be truncated and multiple consecutive space characters in a sentence
+    will be replaced by one space character.
+    :param reference: The reference sentence.
+    :type reference: basestring
+    :param hypothesis: The hypothesis sentence.
+    :type hypothesis: basestring
+    :param ignore_case: Whether to ignore case in the comparison.
+    :type ignore_case: bool
+    :param remove_space: Whether to remove internal space characters.
+    :type remove_space: bool
+    :return: Character error rate.
+    :rtype: float
+    :raises ValueError: If the reference length is zero.
+ """ + edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case, + remove_space) + + if ref_len == 0: + raise ValueError("Length of reference should be greater than 0.") + + cer = float(edit_distance) / ref_len + return cer diff --git a/fluid/DeepASR/tools/profile.py b/fluid/DeepASR/tools/profile.py index cf7329445393a3e767f35cd23939dc6777e06633..8d720c16cd0ec6a9d4bb533a878b07973ced7176 100644 --- a/fluid/DeepASR/tools/profile.py +++ b/fluid/DeepASR/tools/profile.py @@ -168,15 +168,13 @@ def profile(args): start_time = time.time() frames_seen = 0 # load_data - (features, labels, lod) = batch_data - feature_t.set(features.ndarray, place) - feature_t.set_lod([lod.ndarray]) - label_t.set(labels.ndarray, place) - label_t.set_lod([lod.ndarray]) + (features, labels, lod, _) = batch_data + feature_t.set(features, place) + feature_t.set_lod([lod]) + label_t.set(labels, place) + label_t.set_lod([lod]) - frames_seen += lod.ndarray[-1] - - data_reader.recycle(features, labels, lod) + frames_seen += lod[-1] outs = exe.run(fluid.default_main_program(), feed={"feature": feature_t, diff --git a/fluid/DeepASR/train.py b/fluid/DeepASR/train.py index 446e9e0ab16b1d1ee98738ca8cc1510e0e96636e..be99998c8aa7f88d49dab711e94dcd7cfef042d6 100644 --- a/fluid/DeepASR/train.py +++ b/fluid/DeepASR/train.py @@ -192,13 +192,11 @@ def train(args): test_data_reader.batch_iterator(args.batch_size, args.minimum_batch_size)): # load_data - (features, labels, lod) = batch_data - feature_t.set(features.ndarray, place) - feature_t.set_lod([lod.ndarray]) - label_t.set(labels.ndarray, place) - label_t.set_lod([lod.ndarray]) - - test_data_reader.recycle(features, labels, lod) + (features, labels, lod, _) = batch_data + feature_t.set(features, place) + feature_t.set_lod([lod]) + label_t.set(labels, place) + label_t.set_lod([lod]) cost, acc = exe.run(test_program, feed={"feature": feature_t, @@ -212,6 +210,7 @@ def train(args): # train data reader train_data_reader = reader.AsyncDataReader(args.train_feature_lst, args.train_label_lst, -1) + train_data_reader.set_transformers(ltrans) # train for pass_id in xrange(args.pass_num): @@ -220,13 +219,11 @@ def train(args): train_data_reader.batch_iterator(args.batch_size, args.minimum_batch_size)): # load_data - (features, labels, lod) = batch_data - feature_t.set(features.ndarray, place) - feature_t.set_lod([lod.ndarray]) - label_t.set(labels.ndarray, place) - label_t.set_lod([lod.ndarray]) - - train_data_reader.recycle(features, labels, lod) + (features, labels, lod, name_lst) = batch_data + feature_t.set(features, place) + feature_t.set_lod([lod]) + label_t.set(labels, place) + label_t.set_lod([lod]) to_print = batch_id > 0 and (batch_id % args.print_per_batches == 0) outs = exe.run(fluid.default_main_program(), diff --git a/fluid/adversarial/README.md b/fluid/adversarial/README.md index e052361c2ae9fdb77babd820a92a4091e1439987..91661f7e1675d59c7d38c4c09bc67d5b9339573d 100644 --- a/fluid/adversarial/README.md +++ b/fluid/adversarial/README.md @@ -4,10 +4,109 @@ The minimum PaddlePaddle version needed for the code sample in this directory is # Advbox -Advbox is a Python toolbox to create adversarial examples that fool neural networks. It requires Python and paddle. +Advbox is a toolbox to generate adversarial examples that fool neural networks and Advbox can benchmark the robustness of machine learning models. 
-## How to use
+Advbox is based on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) Fluid and is under continual development, always welcoming contributions of the latest methods of adversarial attack and defense.
-1. train a model and save it's parameters. (like fluid_mnist.py)
-2. load the parameters which is trained in step1, then reconstruct the model.(like mnist_tutorial_fgsm.py)
-3. use advbox to generate the adversarial sample.
+
+## Overview
+[Szegedy et al.](https://arxiv.org/abs/1312.6199) were the first to discover an intriguing property of deep neural networks in the context of image classification: state-of-the-art deep networks are surprisingly susceptible to adversarial attacks in the form of small perturbations to images that remain (almost) imperceptible to the human visual system. These perturbations are found by optimizing the input to maximize the prediction error, and the images modified by them are called `adversarial examples`. The profound implications of these results triggered wide interest among researchers in adversarial attacks and defenses for deep learning in general.
+
+Advbox is similar to [Foolbox](https://github.com/bethgelab/foolbox) and [CleverHans](https://github.com/tensorflow/cleverhans). CleverHans only supports the TensorFlow framework, while Foolbox interfaces with many popular machine learning frameworks such as PyTorch, Keras, TensorFlow, Theano, Lasagne and MXNet. However, neither of these two great libraries supports PaddlePaddle, an easy-to-use, efficient, flexible and scalable deep learning platform originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu.
+
+## Usage
+Advbox provides stable reference implementations of modern methods for generating adversarial examples, such as FGSM, DeepFool and JSMA. To benchmark the robustness of your neural networks, use Advbox to generate adversarial examples and evaluate the networks against them. The typical workflow is:
+
+1. Train a model and save its parameters.
+2. Load the trained parameters, then reconstruct the model.
+3. Use Advbox to generate the adversarial samples.
+
+
+#### Dependencies
+* PaddlePaddle: [the latest develop branch](http://www.paddlepaddle.org/docs/develop/documentation/en/build_and_install/pip_install_en.html)
+* Python 2.x
+
+#### Structure
+
+Network models, attack method implementations and the criterion that defines adversarial examples are the three essential elements for generating adversarial examples. For brevity, Advbox adopts misclassification as the adversarial criterion.
+
+The structure of the Advbox module is as follows:
+
+    .
+    ├── advbox
+    |   ├── __init__.py
+    |   ├── attack
+    |   |   ├── __init__.py
+    |   |   ├── base.py
+    |   |   ├── deepfool.py
+    |   |   ├── gradient_method.py
+    |   |   ├── lbfgs.py
+    |   |   └── saliency.py
+    |   ├── models
+    |   |   ├── __init__.py
+    |   |   ├── base.py
+    |   |   └── paddle.py
+    |   └── adversary.py
+    ├── tutorials
+    |   ├── __init__.py
+    |   ├── mnist_model.py
+    |   ├── mnist_tutorial_lbfgs.py
+    |   ├── mnist_tutorial_fgsm.py
+    |   ├── mnist_tutorial_bim.py
+    |   ├── mnist_tutorial_ilcm.py
+    |   ├── mnist_tutorial_mifgsm.py
+    |   ├── mnist_tutorial_jsma.py
+    |   └── mnist_tutorial_deepfool.py
+    └── README.md
+
+**advbox.attack**
+
+Advbox implements several popular adversarial attacks that search for adversarial examples. Each attack method uses a distance measure (L1, L2, etc.) to quantify the size of adversarial perturbations. A minimal attack invocation looks like the sketch below.
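
The following sketch is distilled from the MNIST tutorials this change moves into `./tutorials/`. It assumes the `mnist_cnn_model` network from `tutorials/mnist_model.py`, parameters already trained and loaded (e.g. via `fluid.io.load_params`), and `image`/`label_value` standing for one sample drawn from a data reader:

```python
import paddle.fluid as fluid

from advbox.adversary import Adversary
from advbox.attacks.gradient_method import FGSM
from advbox.models.paddle import PaddleModel
from tutorials.mnist_model import mnist_cnn_model

img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img.stop_gradient = False  # gradients must flow back to the input
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
logits = mnist_cnn_model(img)
cost = fluid.layers.cross_entropy(input=logits, label=label)
avg_cost = fluid.layers.mean(x=cost)

# Wrap the trained program; (-1, 1) are the model's input bounds.
m = PaddleModel(fluid.default_main_program(), 'img', 'label',
                logits.name, avg_cost.name, (-1, 1))
attack = FGSM(m)

adversary = attack(Adversary(image, label_value))
if adversary.is_successful():
    print('label %d misclassified as %d' %
          (label_value, adversary.adversarial_label))
```
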
+Crafting adversarial examples with Advbox is easy, as some attack methods can perform internal hyperparameter tuning to find the minimum perturbation.
+
+**advbox.model**
+
+Advbox implements interfaces to PaddlePaddle. Additionally, interfaces to other deep learning frameworks such as TensorFlow can also be defined and employed. The module is used to compute predictions and gradients for given inputs in a specific framework.
+
+**advbox.adversary**
+
+Adversary contains the original object, the target and the adversarial examples. It provides misclassification as the criterion for accepting an adversarial example.
+
+## Tutorials
+The `./tutorials/` folder provides some tutorials for generating adversarial examples on the MNIST dataset. You can slightly modify the code to apply it to other datasets. These attack methods are supported in Advbox:
+
+* [L-BFGS](https://arxiv.org/abs/1312.6199)
+* [FGSM](https://arxiv.org/abs/1412.6572)
+* [BIM](https://arxiv.org/abs/1607.02533)
+* [ILCM](https://arxiv.org/abs/1607.02533)
+* [MI-FGSM](https://arxiv.org/pdf/1710.06081.pdf)
+* [JSMA](https://arxiv.org/pdf/1511.07528)
+* [DeepFool](https://arxiv.org/abs/1511.04599)
+
+## Testing
+Benchmarks on a vanilla CNN model.
+
+> MNIST
+
+| adversarial attacks | fooling rate (non-targeted) | fooling rate (targeted) | max_epsilon | iterations | Strength |
+|:-----:| :----: | :---: | :----: | :----: | :----: |
+| L-BFGS | --- | 89.2% | --- | One shot | *** |
+| FGSM | 57.8% | 26.55% | 0.3 | One shot | *** |
+| BIM | 97.4% | --- | 0.1 | 100 | **** |
+| ILCM | --- | 100.0% | 0.1 | 100 | **** |
+| MI-FGSM | 94.4% | 100.0% | 0.1 | 100 | **** |
+| JSMA | 96.8% | 90.4% | 0.1 | 2000 | *** |
+| DeepFool | 97.7% | 51.3% | --- | 100 | **** |
+
+* The strength (more asterisks means stronger) is based on the impression from the reviewed literature.
+
+---
+## References
+* [Intriguing properties of neural networks](https://arxiv.org/abs/1312.6199), C. Szegedy et al., arxiv 2014
+* [Explaining and Harnessing Adversarial Examples](https://arxiv.org/abs/1412.6572), I. Goodfellow et al., ICLR 2015
+* [Adversarial Examples In The Physical World](https://arxiv.org/pdf/1607.02533v3.pdf), A. Kurakin et al., ICLR workshop 2017
+* [Boosting Adversarial Attacks with Momentum](https://arxiv.org/abs/1710.06081), Yinpeng Dong et al., arxiv 2018
+* [The Limitations of Deep Learning in Adversarial Settings](https://arxiv.org/abs/1511.07528), N. Papernot et al., ESSP 2016
+* [DeepFool: a simple and accurate method to fool deep neural networks](https://arxiv.org/abs/1511.04599), S.
Moosavi-Dezfooli et al., CVPR 2016 +* [Foolbox: A Python toolbox to benchmark the robustness of machine learning models](https://arxiv.org/abs/1707.04131), Jonas Rauber et al., arxiv 2018 +* [CleverHans: An adversarial example library for constructing attacks, building defenses, and benchmarking both](https://github.com/tensorflow/cleverhans#setting-up-cleverhans) +* [Threat of Adversarial Attacks on Deep Learning in Computer Vision: A Survey](https://arxiv.org/abs/1801.00553), Naveed Akhtar, Ajmal Mian, arxiv 2018 diff --git a/fluid/adversarial/advbox/attacks/gradient_method.py b/fluid/adversarial/advbox/attacks/gradient_method.py index 25b828d41233dea193aef4d953073af3eafdefb3..146b650c21464279f5527eb4a8bf44593e9dce29 100644 --- a/fluid/adversarial/advbox/attacks/gradient_method.py +++ b/fluid/adversarial/advbox/attacks/gradient_method.py @@ -14,7 +14,8 @@ __all__ = [ 'GradientMethodAttack', 'FastGradientSignMethodAttack', 'FGSM', 'FastGradientSignMethodTargetedAttack', 'FGSMT', 'BasicIterativeMethodAttack', 'BIM', - 'IterativeLeastLikelyClassMethodAttack', 'ILCM' + 'IterativeLeastLikelyClassMethodAttack', 'ILCM', 'MomentumIteratorAttack', + 'MIFGSM' ] @@ -32,7 +33,12 @@ class GradientMethodAttack(Attack): super(GradientMethodAttack, self).__init__(model) self.support_targeted = support_targeted - def _apply(self, adversary, norm_ord=np.inf, epsilons=0.01, steps=100): + def _apply(self, + adversary, + norm_ord=np.inf, + epsilons=0.01, + steps=1, + epsilon_steps=100): """ Apply the gradient attack method. :param adversary(Adversary): @@ -41,8 +47,11 @@ class GradientMethodAttack(Attack): Order of the norm, such as np.inf, 1, 2, etc. It can't be 0. :param epsilons(list|tuple|int): Attack step size (input variation). + Largest step size if epsilons is not iterable. :param steps: - The number of iterator steps. + The number of attack iteration. + :param epsilon_steps: + The number of Epsilons' iteration for each attack iteration. :return: adversary(Adversary): The Adversary object. 
""" @@ -55,7 +64,7 @@ class GradientMethodAttack(Attack): "This attack method doesn't support targeted attack!") if not isinstance(epsilons, Iterable): - epsilons = np.linspace(epsilons, epsilons + 1e-10, num=steps) + epsilons = np.linspace(0, epsilons, num=epsilon_steps) pre_label = adversary.original_label min_, max_ = self.model.bounds() @@ -65,30 +74,33 @@ class GradientMethodAttack(Attack): self.model.channel_axis() == adversary.original.shape[0] or self.model.channel_axis() == adversary.original.shape[-1]) - step = 1 - adv_img = adversary.original - for epsilon in epsilons[:steps]: + for epsilon in epsilons[:]: + step = 1 + adv_img = adversary.original if epsilon == 0.0: continue - if adversary.is_targeted_attack: - gradient = -self.model.gradient(adv_img, adversary.target_label) - else: - gradient = self.model.gradient(adv_img, - adversary.original_label) - if norm_ord == np.inf: - gradient_norm = np.sign(gradient) - else: - gradient_norm = gradient / self._norm(gradient, ord=norm_ord) - - adv_img = adv_img + epsilon * gradient_norm * (max_ - min_) - adv_img = np.clip(adv_img, min_, max_) - adv_label = np.argmax(self.model.predict(adv_img)) - logging.info('step={}, epsilon = {:.5f}, pre_label = {}, ' - 'adv_label={}'.format(step, epsilon, pre_label, - adv_label)) - if adversary.try_accept_the_example(adv_img, adv_label): - return adversary - step += 1 + for i in range(steps): + if adversary.is_targeted_attack: + gradient = -self.model.gradient(adv_img, + adversary.target_label) + else: + gradient = self.model.gradient(adv_img, + adversary.original_label) + if norm_ord == np.inf: + gradient_norm = np.sign(gradient) + else: + gradient_norm = gradient / self._norm( + gradient, ord=norm_ord) + + adv_img = adv_img + epsilon * gradient_norm * (max_ - min_) + adv_img = np.clip(adv_img, min_, max_) + adv_label = np.argmax(self.model.predict(adv_img)) + logging.info('step={}, epsilon = {:.5f}, pre_label = {}, ' + 'adv_label={}'.format(step, epsilon, pre_label, + adv_label)) + if adversary.try_accept_the_example(adv_img, adv_label): + return adversary + step += 1 return adversary @staticmethod @@ -113,7 +125,7 @@ class FastGradientSignMethodTargetedAttack(GradientMethodAttack): Paper link: https://arxiv.org/abs/1412.6572 """ - def _apply(self, adversary, epsilons=0.03): + def _apply(self, adversary, epsilons=0.01): return GradientMethodAttack._apply( self, adversary=adversary, @@ -144,7 +156,7 @@ class IterativeLeastLikelyClassMethodAttack(GradientMethodAttack): Paper link: https://arxiv.org/abs/1607.02533 """ - def _apply(self, adversary, epsilons=0.001, steps=1000): + def _apply(self, adversary, epsilons=0.01, steps=1000): return GradientMethodAttack._apply( self, adversary=adversary, @@ -164,7 +176,103 @@ class BasicIterativeMethodAttack(IterativeLeastLikelyClassMethodAttack): super(BasicIterativeMethodAttack, self).__init__(model, False) +class MomentumIteratorAttack(GradientMethodAttack): + """ + The Momentum Iterative Fast Gradient Sign Method (Dong et al. 2017). + This method won the first places in NIPS 2017 Non-targeted Adversarial + Attacks and Targeted Adversarial Attacks. The original paper used + hard labels for this attack; no label smoothing. inf norm. + Paper link: https://arxiv.org/pdf/1710.06081.pdf + """ + + def __init__(self, model, support_targeted=True): + """ + :param model(model): The model to be attacked. + :param support_targeted(bool): Does this attack method support targeted. 
+ """ + super(MomentumIteratorAttack, self).__init__(model) + self.support_targeted = support_targeted + + def _apply(self, + adversary, + norm_ord=np.inf, + epsilons=0.1, + steps=100, + epsilon_steps=100, + decay_factor=1): + """ + Apply the momentum iterative gradient attack method. + :param adversary(Adversary): + The Adversary object. + :param norm_ord(int): + Order of the norm, such as np.inf, 1, 2, etc. It can't be 0. + :param epsilons(list|tuple|float): + Attack step size (input variation). + Largest step size if epsilons is not iterable. + :param epsilon_steps: + The number of Epsilons' iteration for each attack iteration. + :param steps: + The number of attack iteration. + :param decay_factor: + The decay factor for the momentum term. + :return: + adversary(Adversary): The Adversary object. + """ + if norm_ord == 0: + raise ValueError("L0 norm is not supported!") + + if not self.support_targeted: + if adversary.is_targeted_attack: + raise ValueError( + "This attack method doesn't support targeted attack!") + + assert self.model.channel_axis() == adversary.original.ndim + assert (self.model.channel_axis() == 1 or + self.model.channel_axis() == adversary.original.shape[0] or + self.model.channel_axis() == adversary.original.shape[-1]) + + if not isinstance(epsilons, Iterable): + epsilons = np.linspace(0, epsilons, num=epsilon_steps) + + min_, max_ = self.model.bounds() + pre_label = adversary.original_label + + for epsilon in epsilons[:]: + if epsilon == 0.0: + continue + step = 1 + adv_img = adversary.original + momentum = 0 + for i in range(steps): + if adversary.is_targeted_attack: + gradient = -self.model.gradient(adv_img, + adversary.target_label) + else: + gradient = self.model.gradient(adv_img, pre_label) + + # normalize gradient + velocity = gradient / self._norm(gradient, ord=1) + momentum = decay_factor * momentum + velocity + if norm_ord == np.inf: + normalized_grad = np.sign(momentum) + else: + normalized_grad = self._norm(momentum, ord=norm_ord) + perturbation = epsilon * normalized_grad + adv_img = adv_img + perturbation + adv_img = np.clip(adv_img, min_, max_) + adv_label = np.argmax(self.model.predict(adv_img)) + logging.info( + 'step={}, epsilon = {:.5f}, pre_label = {}, adv_label={}' + .format(step, epsilon, pre_label, adv_label)) + if adversary.try_accept_the_example(adv_img, adv_label): + return adversary + step += 1 + + return adversary + + FGSM = FastGradientSignMethodAttack FGSMT = FastGradientSignMethodTargetedAttack BIM = BasicIterativeMethodAttack ILCM = IterativeLeastLikelyClassMethodAttack +MIFGSM = MomentumIteratorAttack diff --git a/fluid/adversarial/mnist_tutorial_fgsm.py b/fluid/adversarial/mnist_tutorial_fgsm.py deleted file mode 100644 index ea3231695bab8c78aceaf7ba0ba375a5c564d5a0..0000000000000000000000000000000000000000 --- a/fluid/adversarial/mnist_tutorial_fgsm.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -FGSM demos on mnist using advbox tool. 
-""" -import matplotlib.pyplot as plt -import paddle.v2 as paddle -import paddle.fluid as fluid - -from advbox.adversary import Adversary -from advbox.attacks.gradient_method import FGSM -from advbox.models.paddle import PaddleModel - - -def cnn_model(img): - """ - Mnist cnn model - Args: - img(Varaible): the input image to be recognized - Returns: - Variable: the label prediction - """ - # conv1 = fluid.nets.conv2d() - conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=img, - num_filters=20, - filter_size=5, - pool_size=2, - pool_stride=2, - act='relu') - - conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - num_filters=50, - filter_size=5, - pool_size=2, - pool_stride=2, - act='relu') - - logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') - return logits - - -def main(): - """ - Advbox demo which demonstrate how to use advbox. - """ - IMG_NAME = 'img' - LABEL_NAME = 'label' - - img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32') - # gradient should flow - img.stop_gradient = False - label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64') - logits = cnn_model(img) - cost = fluid.layers.cross_entropy(input=logits, label=label) - avg_cost = fluid.layers.mean(x=cost) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - - BATCH_SIZE = 1 - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=500), - batch_size=BATCH_SIZE) - feeder = fluid.DataFeeder( - feed_list=[IMG_NAME, LABEL_NAME], - place=place, - program=fluid.default_main_program()) - - fluid.io.load_params( - exe, "./mnist/", main_program=fluid.default_main_program()) - - # advbox demo - m = PaddleModel(fluid.default_main_program(), IMG_NAME, LABEL_NAME, - logits.name, avg_cost.name, (-1, 1)) - att = FGSM(m) - for data in train_reader(): - # fgsm attack - adversary = att(Adversary(data[0][0], data[0][1])) - if adversary.is_successful(): - plt.imshow(adversary.target, cmap='Greys_r') - plt.show() - # np.save('adv_img', adversary.target) - break - - -if __name__ == '__main__': - main() diff --git a/fluid/adversarial/mnist_tutorial_jsma.py b/fluid/adversarial/mnist_tutorial_jsma.py deleted file mode 100644 index d9db8b712cb5ca4fbded2119f249c586d2877b50..0000000000000000000000000000000000000000 --- a/fluid/adversarial/mnist_tutorial_jsma.py +++ /dev/null @@ -1,97 +0,0 @@ -""" -FGSM demos on mnist using advbox tool. -""" -import matplotlib.pyplot as plt -import paddle.v2 as paddle -import paddle.fluid as fluid -import numpy as np - -from advbox import Adversary -from advbox.attacks.saliency import SaliencyMapAttack -from advbox.models.paddle import PaddleModel - - -def cnn_model(img): - """ - Mnist cnn model - Args: - img(Varaible): the input image to be recognized - Returns: - Variable: the label prediction - """ - # conv1 = fluid.nets.conv2d() - conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=img, - num_filters=20, - filter_size=5, - pool_size=2, - pool_stride=2, - act='relu') - - conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - num_filters=50, - filter_size=5, - pool_size=2, - pool_stride=2, - act='relu') - - logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') - return logits - - -def main(): - """ - Advbox demo which demonstrate how to use advbox. 
- """ - IMG_NAME = 'img' - LABEL_NAME = 'label' - - img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32') - # gradient should flow - img.stop_gradient = False - label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64') - logits = cnn_model(img) - cost = fluid.layers.cross_entropy(input=logits, label=label) - avg_cost = fluid.layers.mean(x=cost) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - - BATCH_SIZE = 1 - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=500), - batch_size=BATCH_SIZE) - feeder = fluid.DataFeeder( - feed_list=[IMG_NAME, LABEL_NAME], - place=place, - program=fluid.default_main_program()) - - fluid.io.load_params( - exe, "./mnist/", main_program=fluid.default_main_program()) - - # advbox demo - m = PaddleModel(fluid.default_main_program(), IMG_NAME, LABEL_NAME, - logits.name, avg_cost.name, (-1, 1)) - attack = SaliencyMapAttack(m) - total_num = 0 - success_num = 0 - for data in train_reader(): - total_num += 1 - # adversary.set_target(True, target_label=target_label) - jsma_attack = attack(Adversary(data[0][0], data[0][1])) - if jsma_attack is not None and jsma_attack.is_successful(): - # plt.imshow(jsma_attack.target, cmap='Greys_r') - # plt.show() - success_num += 1 - print('original_label=%d, adversary examples label =%d' % - (data[0][1], jsma_attack.adversarial_label)) - # np.save('adv_img', jsma_attack.adversarial_example) - print('total num = %d, success num = %d ' % (total_num, success_num)) - if total_num == 100: - break - - -if __name__ == '__main__': - main() diff --git a/fluid/adversarial/tutorials/__init__.py b/fluid/adversarial/tutorials/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..822d1f6f037ec1f3e4e41498172ebcf67342e3e0 --- /dev/null +++ b/fluid/adversarial/tutorials/__init__.py @@ -0,0 +1,3 @@ +""" + A set of tutorials for generating adversarial examples with advbox. 
+""" \ No newline at end of file diff --git a/fluid/adversarial/fluid_mnist.py b/fluid/adversarial/tutorials/mnist_model.py similarity index 86% rename from fluid/adversarial/fluid_mnist.py rename to fluid/adversarial/tutorials/mnist_model.py index edeb6b0269366392760795cf290b2e3492aff759..81ff7bdec7bedde2e5d1d1013ad95841cb766510 100644 --- a/fluid/adversarial/fluid_mnist.py +++ b/fluid/adversarial/tutorials/mnist_model.py @@ -30,8 +30,9 @@ def mnist_cnn_model(img): pool_size=2, pool_stride=2, act='relu') + fc = fluid.layers.fc(input=conv_pool_2, size=50, act='relu') - logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + logits = fluid.layers.fc(input=fc, size=10, act='softmax') return logits @@ -60,7 +61,10 @@ def main(): paddle.dataset.mnist.train(), buf_size=500), batch_size=BATCH_SIZE) + # use CPU place = fluid.CPUPlace() + # use GPU + # place = fluid.CUDAPlace(0) exe = fluid.Executor(place) feeder = fluid.DataFeeder(feed_list=[img, label], place=place) exe.run(fluid.default_startup_program()) @@ -74,9 +78,11 @@ def main(): feed=feeder.feed(data), fetch_list=[avg_cost, batch_acc, batch_size]) pass_acc.add(value=acc, weight=b_size) + pass_acc_val = pass_acc.eval()[0] print("pass_id=" + str(pass_id) + " acc=" + str(acc[0]) + - " pass_acc=" + str(pass_acc.eval()[0])) - if loss < LOSS_THRESHOLD and pass_acc > ACC_THRESHOLD: + " pass_acc=" + str(pass_acc_val)) + if loss < LOSS_THRESHOLD and pass_acc_val > ACC_THRESHOLD: + # early stop break print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc.eval()[ diff --git a/fluid/adversarial/tutorials/mnist_tutorial_bim.py b/fluid/adversarial/tutorials/mnist_tutorial_bim.py new file mode 100644 index 0000000000000000000000000000000000000000..b490eba302106cf80df009d30e2babe48af465df --- /dev/null +++ b/fluid/adversarial/tutorials/mnist_tutorial_bim.py @@ -0,0 +1,127 @@ +""" +BIM tutorial on mnist using advbox tool. +BIM method iteratively take multiple small steps while adjusting the direction after each step. +It only supports non-targeted attack. +""" +import sys +sys.path.append("..") + +import matplotlib.pyplot as plt +import paddle.fluid as fluid +import paddle.v2 as paddle + +from advbox.adversary import Adversary +from advbox.attacks.gradient_method import BIM +from advbox.models.paddle import PaddleModel +from tutorials.mnist_model import mnist_cnn_model + + +def main(): + """ + Advbox demo which demonstrate how to use advbox. 
+ """ + TOTAL_NUM = 500 + IMG_NAME = 'img' + LABEL_NAME = 'label' + + img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32') + # gradient should flow + img.stop_gradient = False + label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64') + logits = mnist_cnn_model(img) + cost = fluid.layers.cross_entropy(input=logits, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # use CPU + place = fluid.CPUPlace() + # use GPU + # place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + BATCH_SIZE = 1 + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.test(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + fluid.io.load_params( + exe, "./mnist/", main_program=fluid.default_main_program()) + + # advbox demo + m = PaddleModel( + fluid.default_main_program(), + IMG_NAME, + LABEL_NAME, + logits.name, + avg_cost.name, (-1, 1), + channel_axis=1) + attack = BIM(m) + attack_config = {"epsilons": 0.1, "steps": 100} + + # use train data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in train_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + + # BIM non-targeted attack + adversary = attack(adversary, **attack_config) + + if adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TRAIN_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / total_count)) + break + + # use test data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in test_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + + # BIM non-targeted attack + adversary = attack(adversary, **attack_config) + + if adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TEST_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / total_count)) + break + print("bim attack done") + + +if __name__ == '__main__': + main() diff --git a/fluid/adversarial/tutorials/mnist_tutorial_deepfool.py b/fluid/adversarial/tutorials/mnist_tutorial_deepfool.py new file mode 100644 index 0000000000000000000000000000000000000000..2b12c81945859b42809e33ccd74ead53f4d4eb05 --- /dev/null +++ b/fluid/adversarial/tutorials/mnist_tutorial_deepfool.py @@ -0,0 +1,137 @@ +""" +DeepFool tutorial on mnist using advbox tool. +Deepfool is a simple and accurate adversarial attack method. +It supports both targeted attack and non-targeted attack. 
+""" +import sys +sys.path.append("..") + +import matplotlib.pyplot as plt +import paddle.fluid as fluid +import paddle.v2 as paddle + +from advbox.adversary import Adversary +from advbox.attacks.deepfool import DeepFoolAttack +from advbox.models.paddle import PaddleModel +from tutorials.mnist_model import mnist_cnn_model + + +def main(): + """ + Advbox demo which demonstrate how to use advbox. + """ + TOTAL_NUM = 500 + IMG_NAME = 'img' + LABEL_NAME = 'label' + + img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32') + # gradient should flow + img.stop_gradient = False + label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64') + logits = mnist_cnn_model(img) + cost = fluid.layers.cross_entropy(input=logits, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # use CPU + place = fluid.CPUPlace() + # use GPU + # place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + BATCH_SIZE = 1 + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.test(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + fluid.io.load_params( + exe, "./mnist/", main_program=fluid.default_main_program()) + + # advbox demo + m = PaddleModel( + fluid.default_main_program(), + IMG_NAME, + LABEL_NAME, + logits.name, + avg_cost.name, (-1, 1), + channel_axis=1) + attack = DeepFoolAttack(m) + attack_config = {"iterations": 100, "overshoot": 9} + + # use train data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in train_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + + # DeepFool non-targeted attack + adversary = attack(adversary, **attack_config) + + # DeepFool targeted attack + # tlabel = 0 + # adversary.set_target(is_targeted_attack=True, target_label=tlabel) + # adversary = attack(adversary, **attack_config) + + if adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TRAIN_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / total_count)) + break + + # use test data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in test_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + + # DeepFool non-targeted attack + adversary = attack(adversary, **attack_config) + + # DeepFool targeted attack + # tlabel = 0 + # adversary.set_target(is_targeted_attack=True, target_label=tlabel) + # adversary = attack(adversary, **attack_config) + + if adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TEST_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / 
total_count)) + break + print("deelfool attack done") + + +if __name__ == '__main__': + main() diff --git a/fluid/adversarial/tutorials/mnist_tutorial_fgsm.py b/fluid/adversarial/tutorials/mnist_tutorial_fgsm.py new file mode 100644 index 0000000000000000000000000000000000000000..eeb7bc477ed090eac547fe3db50b08b2a513f0d7 --- /dev/null +++ b/fluid/adversarial/tutorials/mnist_tutorial_fgsm.py @@ -0,0 +1,139 @@ +""" +FGSM tutorial on mnist using advbox tool. +FGSM method is non-targeted attack while FGSMT is targeted attack. +""" +import sys +sys.path.append("..") + +import matplotlib.pyplot as plt +import numpy as np +import paddle.fluid as fluid +import paddle.v2 as paddle + +from advbox.adversary import Adversary +from advbox.attacks.gradient_method import FGSM +from advbox.attacks.gradient_method import FGSMT +from advbox.models.paddle import PaddleModel +from tutorials.mnist_model import mnist_cnn_model + + +def main(): + """ + Advbox demo which demonstrate how to use advbox. + """ + TOTAL_NUM = 500 + IMG_NAME = 'img' + LABEL_NAME = 'label' + + img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32') + # gradient should flow + img.stop_gradient = False + label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64') + logits = mnist_cnn_model(img) + cost = fluid.layers.cross_entropy(input=logits, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # use CPU + place = fluid.CPUPlace() + # use GPU + # place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + BATCH_SIZE = 1 + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.test(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + fluid.io.load_params( + exe, "./mnist/", main_program=fluid.default_main_program()) + + # advbox demo + m = PaddleModel( + fluid.default_main_program(), + IMG_NAME, + LABEL_NAME, + logits.name, + avg_cost.name, (-1, 1), + channel_axis=1) + attack = FGSM(m) + # attack = FGSMT(m) + attack_config = {"epsilons": 0.3} + + # use train data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in train_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + + # FGSM non-targeted attack + adversary = attack(adversary, **attack_config) + + # FGSMT targeted attack + # tlabel = 0 + # adversary.set_target(is_targeted_attack=True, target_label=tlabel) + # adversary = attack(adversary, **attack_config) + + if adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TRAIN_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / total_count)) + break + + # use test data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in test_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + + # FGSM non-targeted attack + adversary = attack(adversary, **attack_config) + + # FGSMT targeted attack + # tlabel = 0 + # adversary.set_target(is_targeted_attack=True, target_label=tlabel) + # adversary = attack(adversary, 
**attack_config) + + if adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TEST_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / total_count)) + break + print("fgsm attack done") + + +if __name__ == '__main__': + main() diff --git a/fluid/adversarial/tutorials/mnist_tutorial_ilcm.py b/fluid/adversarial/tutorials/mnist_tutorial_ilcm.py new file mode 100644 index 0000000000000000000000000000000000000000..3d155e583415962f62ee7f581d32dd57a6b1cc1b --- /dev/null +++ b/fluid/adversarial/tutorials/mnist_tutorial_ilcm.py @@ -0,0 +1,130 @@ +""" +ILCM tutorial on mnist using advbox tool. +ILCM method extends "BIM" to support targeted attack. +""" +import sys +sys.path.append("..") + +import matplotlib.pyplot as plt +import paddle.fluid as fluid +import paddle.v2 as paddle + +from advbox.adversary import Adversary +from advbox.attacks.gradient_method import ILCM +from advbox.models.paddle import PaddleModel +from tutorials.mnist_model import mnist_cnn_model + + +def main(): + """ + Advbox demo which demonstrate how to use advbox. + """ + TOTAL_NUM = 500 + IMG_NAME = 'img' + LABEL_NAME = 'label' + + img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32') + # gradient should flow + img.stop_gradient = False + label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64') + logits = mnist_cnn_model(img) + cost = fluid.layers.cross_entropy(input=logits, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # use CPU + place = fluid.CPUPlace() + # use GPU + # place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + BATCH_SIZE = 1 + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.test(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + fluid.io.load_params( + exe, "./mnist/", main_program=fluid.default_main_program()) + + # advbox demo + m = PaddleModel( + fluid.default_main_program(), + IMG_NAME, + LABEL_NAME, + logits.name, + avg_cost.name, (-1, 1), + channel_axis=1) + attack = ILCM(m) + attack_config = {"epsilons": 0.1, "steps": 100} + + # use train data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in train_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + tlabel = 0 + adversary.set_target(is_targeted_attack=True, target_label=tlabel) + + # ILCM targeted attack + adversary = attack(adversary, **attack_config) + + if adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TRAIN_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / total_count)) + break + + # use test data to 
generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in test_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + tlabel = 0 + adversary.set_target(is_targeted_attack=True, target_label=tlabel) + + # ILCM targeted attack + adversary = attack(adversary, **attack_config) + + if adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TEST_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / total_count)) + break + print("ilcm attack done") + + +if __name__ == '__main__': + main() diff --git a/fluid/adversarial/tutorials/mnist_tutorial_jsma.py b/fluid/adversarial/tutorials/mnist_tutorial_jsma.py new file mode 100644 index 0000000000000000000000000000000000000000..070d2f5f5e3bcd50cdfb12f67e7c1a9453f31676 --- /dev/null +++ b/fluid/adversarial/tutorials/mnist_tutorial_jsma.py @@ -0,0 +1,142 @@ +""" +JSMA tutorial on mnist using advbox tool. +JSMA method supports both targeted attack and non-targeted attack. +""" +import sys +sys.path.append("..") + +import matplotlib.pyplot as plt +import paddle.fluid as fluid +import paddle.v2 as paddle + +from advbox.adversary import Adversary +from advbox.attacks.saliency import JSMA +from advbox.models.paddle import PaddleModel +from tutorials.mnist_model import mnist_cnn_model + + +def main(): + """ + Advbox demo which demonstrate how to use advbox. 
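ILCM above is the first tutorial that must assign a target label before attacking: an `Adversary` starts out non-targeted, and `set_target` switches it. The fragment below isolates that pattern; `attack`, `attack_config`, and `data` come from the surrounding tutorial code, and target label 0 is simply the value the tutorial happens to use.

```python
# targeted-attack pattern shared by the ILCM and LBFGS tutorials;
# `attack`, `attack_config`, and `data` are set up by the tutorial
from advbox.adversary import Adversary

adversary = Adversary(data[0][0], data[0][1])
adversary.set_target(is_targeted_attack=True, target_label=0)
adversary = attack(adversary, **attack_config)
if adversary.is_successful():
    print('hit target label, adversarial_label=%d' %
          adversary.adversarial_label)
```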
+ """ + TOTAL_NUM = 500 + IMG_NAME = 'img' + LABEL_NAME = 'label' + + img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32') + # gradient should flow + img.stop_gradient = False + label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64') + logits = mnist_cnn_model(img) + cost = fluid.layers.cross_entropy(input=logits, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # use CPU + place = fluid.CPUPlace() + # use GPU + # place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + BATCH_SIZE = 1 + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.test(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + fluid.io.load_params( + exe, "./mnist/", main_program=fluid.default_main_program()) + + # advbox demo + m = PaddleModel( + fluid.default_main_program(), + IMG_NAME, + LABEL_NAME, + logits.name, + avg_cost.name, (-1, 1), + channel_axis=1) + attack = JSMA(m) + attack_config = { + "max_iter": 2000, + "theta": 0.1, + "max_perturbations_per_pixel": 7 + } + + # use train data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in train_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + + # JSMA non-targeted attack + adversary = attack(adversary, **attack_config) + + # JSMA targeted attack + # tlabel = 0 + # adversary.set_target(is_targeted_attack=True, target_label=tlabel) + # adversary = attack(adversary, **attack_config) + + # JSMA may return None + if adversary is not None and adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TRAIN_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / total_count)) + break + + # use test data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in test_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + + # JSMA non-targeted attack + adversary = attack(adversary, **attack_config) + + # JSMA targeted attack + # tlabel = 0 + # adversary.set_target(is_targeted_attack=True, target_label=tlabel) + # adversary = attack(adversary, **attack_config) + + # JSMA may return None + if adversary is not None and adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TEST_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / total_count)) + break + print("jsma attack done") + + +if __name__ == '__main__': + main() diff --git a/fluid/adversarial/tutorials/mnist_tutorial_lbfgs.py b/fluid/adversarial/tutorials/mnist_tutorial_lbfgs.py new file mode 100644 index 
0000000000000000000000000000000000000000..9b16c32bb6543409c487b31fe80d8cdc162b55d1 --- /dev/null +++ b/fluid/adversarial/tutorials/mnist_tutorial_lbfgs.py @@ -0,0 +1,130 @@ +""" +LBFGS tutorial on mnist using advbox tool. +LBFGS method only supports targeted attack. +""" +import sys +sys.path.append("..") + +import matplotlib.pyplot as plt +import paddle.fluid as fluid +import paddle.v2 as paddle + +from advbox.adversary import Adversary +from advbox.attacks.lbfgs import LBFGS +from advbox.models.paddle import PaddleModel +from tutorials.mnist_model import mnist_cnn_model + + +def main(): + """ + Advbox demo which demonstrate how to use advbox. + """ + TOTAL_NUM = 500 + IMG_NAME = 'img' + LABEL_NAME = 'label' + + img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32') + # gradient should flow + img.stop_gradient = False + label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64') + logits = mnist_cnn_model(img) + cost = fluid.layers.cross_entropy(input=logits, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # use CPU + place = fluid.CPUPlace() + # use GPU + # place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + BATCH_SIZE = 1 + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.test(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + fluid.io.load_params( + exe, "./mnist/", main_program=fluid.default_main_program()) + + # advbox demo + m = PaddleModel( + fluid.default_main_program(), + IMG_NAME, + LABEL_NAME, + logits.name, + avg_cost.name, (-1, 1), + channel_axis=1) + attack = LBFGS(m) + attack_config = {"epsilon": 0.001, } + + # use train data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in train_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + + # LBFGS targeted attack + tlabel = 0 + adversary.set_target(is_targeted_attack=True, target_label=tlabel) + adversary = attack(adversary, **attack_config) + + if adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TRAIN_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / total_count)) + break + + # use test data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in test_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + + # LBFGS targeted attack + tlabel = 0 + adversary.set_target(is_targeted_attack=True, target_label=tlabel) + adversary = attack(adversary, **attack_config) + + if adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TEST_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % 
(fooling_count, total_count, + float(fooling_count) / total_count)) + break + print("lbfgs attack done") + + +if __name__ == '__main__': + main() diff --git a/fluid/adversarial/tutorials/mnist_tutorial_mifgsm.py b/fluid/adversarial/tutorials/mnist_tutorial_mifgsm.py new file mode 100644 index 0000000000000000000000000000000000000000..ded7ef4b19cd4d99d2c3143f703e3d594058f705 --- /dev/null +++ b/fluid/adversarial/tutorials/mnist_tutorial_mifgsm.py @@ -0,0 +1,143 @@ +""" +MIFGSM tutorial on mnist using advbox tool. +MIFGSM is a broad class of momentum iterative gradient-based methods based on FSGM. +It supports non-targeted attack and targeted attack. +""" +import sys +sys.path.append("..") + +import matplotlib.pyplot as plt +import numpy as np +import paddle.fluid as fluid +import paddle.v2 as paddle + +from advbox.adversary import Adversary +from advbox.attacks.gradient_method import MIFGSM +from advbox.models.paddle import PaddleModel +from tutorials.mnist_model import mnist_cnn_model + + +def main(): + """ + Advbox demo which demonstrate how to use advbox. + """ + TOTAL_NUM = 500 + IMG_NAME = 'img' + LABEL_NAME = 'label' + + img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32') + # gradient should flow + img.stop_gradient = False + label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64') + logits = mnist_cnn_model(img) + cost = fluid.layers.cross_entropy(input=logits, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # use CPU + place = fluid.CPUPlace() + # use GPU + # place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + BATCH_SIZE = 1 + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.test(), buf_size=128 * 10), + batch_size=BATCH_SIZE) + + fluid.io.load_params( + exe, "./mnist/", main_program=fluid.default_main_program()) + + # advbox demo + m = PaddleModel( + fluid.default_main_program(), + IMG_NAME, + LABEL_NAME, + logits.name, + avg_cost.name, (-1, 1), + channel_axis=1) + attack = MIFGSM(m) + attack_config = { + "norm_ord": np.inf, + "epsilons": 0.1, + "steps": 100, + "decay_factor": 1 + } + + # use train data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in train_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + + # MIFGSM non-targeted attack + adversary = attack(adversary, **attack_config) + + # MIFGSM targeted attack + # tlabel = 0 + # adversary.set_target(is_targeted_attack=True, target_label=tlabel) + # adversary = attack(adversary, **attack_config) + + if adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TRAIN_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / total_count)) + break + + # use test data to generate adversarial examples + total_count = 0 + fooling_count = 0 + for data in test_reader(): + total_count += 1 + adversary = Adversary(data[0][0], data[0][1]) + + # MIFGSM non-targeted attack + adversary = attack(adversary, **attack_config) + + # MIFGSM targeted 
attack + # tlabel = 0 + # adversary.set_target(is_targeted_attack=True, target_label=tlabel) + # adversary = attack(adversary, **attack_config) + + if adversary.is_successful(): + fooling_count += 1 + print( + 'attack success, original_label=%d, adversarial_label=%d, count=%d' + % (data[0][1], adversary.adversarial_label, total_count)) + # plt.imshow(adversary.target, cmap='Greys_r') + # plt.show() + # np.save('adv_img', adversary.target) + else: + print('attack failed, original_label=%d, count=%d' % + (data[0][1], total_count)) + + if total_count >= TOTAL_NUM: + print( + "[TEST_DATASET]: fooling_count=%d, total_count=%d, fooling_rate=%f" + % (fooling_count, total_count, + float(fooling_count) / total_count)) + break + print("mifgsm attack done") + + +if __name__ == '__main__': + main() diff --git a/fluid/image_classification/caffe2fluid/README.md b/fluid/image_classification/caffe2fluid/README.md index 5f565afe0c33db291092faeac632da3d51f95613..9a6daad90222ab036cac896a66e50f273deac3d7 100644 --- a/fluid/image_classification/caffe2fluid/README.md +++ b/fluid/image_classification/caffe2fluid/README.md @@ -1,36 +1,76 @@ ### Caffe2Fluid -This tool is used to convert a Caffe model to Fluid model +This tool is used to convert a Caffe model to a Fluid model -### Howto -1, Prepare caffepb.py in ./proto if your python has no 'pycaffe' module, two options provided here: - - 1) generate it from caffe.proto using protoc +### HowTo +1. Prepare caffepb.py in ./proto if your python has no 'pycaffe' module, two options provided here: + - Generate pycaffe from caffe.proto + ``` bash ./proto/compile.sh + ``` - 2) download one from github directly + - Download one from github directly + ``` cd proto/ && wget https://github.com/ethereon/caffe-tensorflow/blob/master/kaffe/caffe/caffepb.py + ``` + +2. Convert the Caffe model to Fluid model + - Generate fluid code and weight file + ``` + python convert.py alexnet.prototxt \ + --caffemodel alexnet.caffemodel \ + --data-output-path alexnet.npy \ + --code-output-path alexnet.py + ``` + + - Save weights as fluid model file + ``` + python alexnet.py alexnet.npy ./fluid #only infer the last layer's result + python alexnet.py alexnet.npy ./fluid fc8,prob #infer these 2 layer's result + ``` + +3. Use the converted model to infer + - See more details in '*examples/imagenet/run.sh*' + +4. Compare the inference results with caffe + - See more details in '*examples/imagenet/diff.sh*' + +### How to convert custom layer +1. Implement your custom layer in a file under '*kaffe/custom_layers*', eg: mylayer.py + - Implement ```shape_func(input_shape, [other_caffe_params])``` to calculate the output shape + - Implement ```layer_func(inputs, name, [other_caffe_params])``` to construct a fluid layer + - Register these two functions ```register(kind='MyType', shape=shape_func, layer=layer_func)``` + - Notes: more examples can be found in '*kaffe/custom_layers*' + +2. Add ```import mylayer``` to '*kaffe/custom_layers/\_\_init__.py*' -2, Convert the caffe model using 'convert.py' which will generate a python script and a weight(in .npy) file +3. Prepare your pycaffe as your customized version(same as previous env prepare) + - (option1) replace 'proto/caffe.proto' with your own caffe.proto and compile it + - (option2) change your pycaffe to the customized version -3, Use the converted model to predict +4. Convert the Caffe model to Fluid model - see more detail info in 'examples/xxx' +5. 
Set env $CAFFE2FLUID_CUSTOM_LAYERS to the parent directory of 'custom_layers'
+   ```
+   export CAFFE2FLUID_CUSTOM_LAYERS=/path/to/caffe2fluid/kaffe
+   ```
+6. Use the converted model when loading model in 'xxxnet.py' and 'xxxnet.npy' (no need if model is already in 'fluid/model' and 'fluid/params')

### Tested models
-- Lenet on mnist dataset
+- Lenet:
+[model addr](https://github.com/ethereon/caffe-tensorflow/blob/master/examples/mnist)

 - ResNets:(ResNet-50, ResNet-101, ResNet-152)
-  model addr: `https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777`_
+[model addr](https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777)

 - GoogleNet:
-  model addr: `https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034`_
+[model addr](https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034)

 - VGG:
-  model addr: `https://gist.github.com/ksimonyan/211839e770f7b538e2d8`_
+[model addr](https://gist.github.com/ksimonyan/211839e770f7b538e2d8)

 - AlexNet:
-  model addr: `https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet`_
+[model addr](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet)

### Notes
-Some of this code come from here: https://github.com/ethereon/caffe-tensorflow
+Some of this code comes from here: [caffe-tensorflow](https://github.com/ethereon/caffe-tensorflow)
diff --git a/fluid/image_classification/caffe2fluid/convert.py b/fluid/image_classification/caffe2fluid/convert.py
index 379f1a26368c9ffa4a9f82dad499ad7114f942fc..b0252e3c03db3626696a3672971f0704461417e7 100755
--- a/fluid/image_classification/caffe2fluid/convert.py
+++ b/fluid/image_classification/caffe2fluid/convert.py
@@ -43,11 +43,17 @@ def convert(def_path, caffemodel_path, data_output_path, code_output_path,
             print_stderr('Saving source...')
             with open(code_output_path, 'wb') as src_out:
                 src_out.write(transformer.transform_source())
+            print_stderr('set env variable before using converted model '\
+                'if used custom_layers:')
+            custom_pk_path = os.path.dirname(os.path.abspath(__file__))
+            custom_pk_path = os.path.join(custom_pk_path, 'kaffe')
+            print_stderr('export CAFFE2FLUID_CUSTOM_LAYERS=%s' % (custom_pk_path))
         print_stderr('Done.')
+        return 0
     except KaffeError as err:
         fatal_error('Error encountered: {}'.format(err))
-    return 0
+    return 1


 def main():
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/README.md b/fluid/image_classification/caffe2fluid/examples/imagenet/README.md
index b82050859239be8804ddec8e2054edc38c4ac052..b9cf1941d29428c84c34df2a9ec00d7ae8e79014 100644
--- a/fluid/image_classification/caffe2fluid/examples/imagenet/README.md
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/README.md
@@ -1,10 +1,37 @@
-a demo to show converting caffe models on 'imagenet' using caffe2fluid
+A demo to show converting caffe models on 'imagenet' using caffe2fluid

 ---

 # How to use

-1. prepare python environment
-2. download caffe model to "models.caffe/xxx" which contains "xxx.caffemodel" and "xxx.prototxt"
-3. run the tool
-   eg: bash ./run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50
+1. Prepare python environment
+
+2. Download caffe model to "models.caffe/xxx" which contains "xxx.caffemodel" and "xxx.prototxt"
+
+3. Convert the Caffe model to Fluid model
+    - generate fluid code and weight file
+    ```
+    python convert.py alexnet.prototxt \
+        --caffemodel alexnet.caffemodel \
+        --data-output-path alexnet.npy \
+        --code-output-path alexnet.py
+    ```
+
+    - save weights as fluid model file
+    ```
+    python alexnet.py alexnet.npy ./fluid_model
+    ```
+
+4. Do inference
+    ```
+    python infer.py infer ./fluid_model data/65.jpeg
+    ```
+
+5. Convert model and do inference together
+    ```
+    bash ./run.sh alexnet ./models.caffe/alexnet ./models/alexnet
+    ```
+    The Caffe model is stored in './models.caffe/alexnet/alexnet.prototxt|caffemodel'
+    and the Fluid model will be saved in './models/alexnet/alexnet.py|npy'
+
+6. Test the difference with caffe's results (needs pycaffe installed)
+    ```
+    bash ./diff.sh resnet
+    ```
+Make sure your caffemodel is stored in './models.caffe/resnet'.
+The results will be stored in './results/resnet.paddle|caffe'
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py
new file mode 100644
index 0000000000000000000000000000000000000000..07d4ed1af50a803aee206da6c7582d079a1a1dca
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py
@@ -0,0 +1,85 @@
+#!/usr/bin/python
+
+#
+#a tool to compare tensors in two files or two directories
+#
+
+import sys
+import os
+
+
+def walk_dir(rootdir):
+    for subdir, dirs, files in os.walk(rootdir):
+        for file in files:
+            yield file
+
+
+def calc_diff(f1, f2):
+    import numpy as np
+
+    d1 = np.load(f1).flatten()
+    d2 = np.load(f2).flatten()
+
+    d1_num = reduce(lambda x, y: x * y, d1.shape)
+    d2_num = reduce(lambda x, y: x * y, d2.shape)
+    if d1_num != d2_num:
+        print(d1.shape)
+        print(d2.shape)
+        assert (d1_num == d2_num), "their shapes are not consistent"
+
+    try:
+        df = np.abs(d1 - d2)
+        max_df = np.max(df)
+        sq_df = np.mean(df * df)
+        return max_df, sq_df
+    except Exception as e:
+        return -1.0, -1.0
+
+
+def compare(path1, path2):
+    def diff(f1, f2):
+        max_df, sq_df = calc_diff(f1, f2)
+        print('compare %s <=> %s with result[max_df:%.4e, sq_df:%.4e]' %
+              (f1, f2, max_df, sq_df))
+        assert (max_df < 1e-5), \
+            'max_df is too large with value[%.6e]' % (max_df)
+        assert (sq_df < 1e-10), \
+            'sq_df is too large with value[%.6e]' % (sq_df)
+
+    if os.path.exists(path1) is False:
+        print('not found %s' % (path1))
+        return 1
+    elif os.path.exists(path2) is False:
+        print('not found %s' % (path2))
+        return 1
+
+    if path1.find('.npy') > 0 and path2.find('.npy') > 0:
+        diff(path1, path2)
+        return 0
+
+    for f in walk_dir(path2):
+        if f.find('.npy') < 0:
+            continue
+
+        f1 = os.path.join(path1, f)
+        f2 = os.path.join(path2, f)
+        diff(f1, f2)
+
+    print('all checks succeeded')
+    return 0
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        path1 = 'lenet.tf/results'
+        path2 = 'lenet.paddle/results'
+    elif len(sys.argv) == 3:
+        path1 = sys.argv[1]
+        path2 = sys.argv[2]
+    else:
+        print('usage:')
+        print('  %s [path1] [path2]' % (sys.argv[0]))
+        exit(1)
+
+    print('compare inner result in %s %s' % (path1, path2))
+    exit(compare(path1, path2))
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/diff.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/diff.sh
new file mode 100755
index 0000000000000000000000000000000000000000..af72caea536d6b6c3d1027e7d1327af52a6ceda6
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/diff.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+#
+#function:
+#   a tool used to check the difference of models' results generated by caffe model and paddle model
+#
+#howto:
+#   bash diff.sh resnet50  #when this has finished, you can get the difference in precision
+#
+#notes:
+#   0, in order to infer using caffe, we need pycaffe installed
+#   1, prepare your caffe model in 'models.caffe/', eg: 'model.caffe/resnet101/resnet101.[prototxt|caffemodel]'
+#   2, converted paddle model will be in 'models'
+#   3, results of layers will be stored in 'results/${model_name}.[paddle|caffe]'
+#   4, only the last layer will be checked by default
+
+model_name="resnet50"
+results_root="results/"
+
+if [[ -n $1 ]];then
+    if [ $1 = "-h" ];then
+        echo "usage:"
+        echo "  bash $0 [model_name]"
+        echo "  eg:bash $0 resnet50"
+        exit 0
+    fi
+    model_name=$1
+fi
+
+mkdir -p $results_root
+
+model_prototxt="models.caffe/$model_name/${model_name}.prototxt" +model_caffemodel="models.caffe/${model_name}/${model_name}.caffemodel" + +#1, dump layers' results from paddle +paddle_results="$results_root/${model_name}.paddle" +rm -rf $paddle_results +rm -rf "results.paddle" +bash run.sh $model_name ./models.caffe/$model_name ./models/$model_name +if [[ $? -ne 0 ]] || [[ ! -e "results.paddle" ]];then + echo "not found paddle's results, maybe failed to convert" + exit 1 +fi +mv results.paddle $paddle_results + +#2, dump layers' results from caffe +caffe_results="$results_root/${model_name}.caffe" +rm -rf $caffe_results +rm -rf "results.caffe" +cfpython ./infer.py caffe $model_prototxt $model_caffemodel $paddle_results/data.npy +if [[ $? -ne 0 ]] || [[ ! -e "results.caffe" ]];then + echo "not found caffe's results, maybe failed to do inference with caffe" + exit 1 +fi +mv results.caffe $caffe_results + +#3, extract layer names +cat $model_prototxt | grep name | perl -ne 'if(/^\s*name:\s+\"([^\"]+)/){ print $1."\n";}' >.layer_names + +#4, compare one by one +for i in $(cat ".layer_names" | tail -n1);do + echo "process $i" + python compare.py $caffe_results/${i}.npy $paddle_results/${i}.npy +done diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py index ec594199be5a3e7a33c9673b1d5497c95f20d946..d71a91ad7e731e4585ae4adfb44b0a1019260e0d 100644 --- a/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py @@ -10,8 +10,11 @@ import os import sys import inspect import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid + + +def import_fluid(): + import paddle.fluid as fluid + return fluid def load_data(imgfile, shape): @@ -40,7 +43,7 @@ def build_model(net_file, net_name): (net_file, net_name)) net_path = os.path.dirname(net_file) - module_name = os.path.basename(net_file).rstrip('.py') + module_name = os.path.splitext(os.path.basename(net_file))[0] if net_path not in sys.path: sys.path.insert(0, net_path) @@ -48,23 +51,25 @@ def build_model(net_file, net_name): m = __import__(module_name, fromlist=[net_name]) MyNet = getattr(m, net_name) except Exception as e: - print('failed to load module[%s]' % (module_name)) + print('failed to load module[%s.%s]' % (module_name, net_name)) print(e) return None - input_name = 'data' - input_shape = MyNet.input_shapes()[input_name] - images = fluid.layers.data(name='image', shape=input_shape, dtype='float32') + fluid = import_fluid() + inputs_dict = MyNet.input_shapes() + input_name = inputs_dict.keys()[0] + input_shape = inputs_dict[input_name] + images = fluid.layers.data( + name=input_name, shape=input_shape, dtype='float32') #label = fluid.layers.data(name='label', shape=[1], dtype='int64') net = MyNet({input_name: images}) - input_shape = MyNet.input_shapes()[input_name] - return net, input_shape + return net, inputs_dict def dump_results(results, names, root): if os.path.exists(root) is False: - os.path.mkdir(root) + os.mkdir(root) for i in range(len(names)): n = names[i] @@ -73,23 +78,27 @@ def dump_results(results, names, root): np.save(filename + '.npy', res) -def infer(net_file, net_name, model_file, imgfile, debug=False): - """ do inference using a model which consist 'xxx.py' and 'xxx.npy' +def load_model(exe, place, net_file, net_name, net_weight, debug): + """ load model using xxxnet.py and xxxnet.npy """ + fluid = import_fluid() + #1, build model - 
net, input_shape = build_model(net_file, net_name) + net, input_map = build_model(net_file, net_name) + feed_names = input_map.keys() + feed_shapes = [v for k, v in input_map.items()] + prediction = net.get_output() #2, load weights for this model - place = fluid.CPUPlace() - exe = fluid.Executor(place) startup_program = fluid.default_startup_program() exe.run(startup_program) - if model_file.find('.npy') > 0: - net.load(data_path=model_file, exe=exe, place=place) + #place = fluid.CPUPlace() + if net_weight.find('.npy') > 0: + net.load(data_path=net_weight, exe=exe, place=place) else: - net.load(data_path=model_file, exe=exe) + raise ValueError('not found weight file') #3, test this model test_program = fluid.default_main_program().clone() @@ -103,18 +112,116 @@ def infer(net_file, net_name, model_file, imgfile, debug=False): fetch_list_var.append(v) fetch_list_name.append(k) + return { + 'program': test_program, + 'feed_names': feed_names, + 'fetch_vars': fetch_list_var, + 'fetch_names': fetch_list_name, + 'feed_shapes': feed_shapes + } + + +def get_shape(fluid, program, name): + for var in program.list_vars(): + if var.name == 'data': + return list(var.shape[1:]) + + raise ValueError('not found shape for input layer[%s], ' + 'you can specify by yourself' % (name)) + + +def load_inference_model(dirname, exe): + """ load fluid's inference model + """ + fluid = import_fluid() + model_fn = 'model' + params_fn = 'params' + if os.path.exists(os.path.join(dirname, model_fn)) \ + and os.path.exists(os.path.join(dirname, params_fn)): + program, feed_names, fetch_targets = fluid.io.load_inference_model(\ + dirname, exe, model_fn, params_fn) + else: + raise ValueError('not found model files in direcotry[%s]' % (dirname)) + + #print fluid.global_scope().find_var(feed_names[0]) + input_shape = get_shape(fluid, program, feed_names[0]) + feed_shapes = [input_shape] + + return program, feed_names, fetch_targets, feed_shapes + + +def infer(model_path, imgfile, net_file=None, net_name=None, debug=True): + """ do inference using a model which consist 'xxx.py' and 'xxx.npy' + """ + fluid = import_fluid() + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + try: + ret = load_inference_model(model_path, exe) + program, feed_names, fetch_targets, feed_shapes = ret + debug = False + print('found a inference model for fluid') + except ValueError as e: + print('try to load model using net file and weight file') + net_weight = model_path + ret = load_model(exe, place, net_file, net_name, net_weight, debug) + program = ret['program'] + feed_names = ret['feed_names'] + fetch_targets = ret['fetch_vars'] + fetch_list_name = ret['fetch_names'] + feed_shapes = ret['feed_shapes'] + + input_name = feed_names[0] + input_shape = feed_shapes[0] + np_images = load_data(imgfile, input_shape) - results = exe.run(program=test_program, - feed={'image': np_images}, - fetch_list=fetch_list_var) + results = exe.run(program=program, + feed={input_name: np_images}, + fetch_list=fetch_targets) if debug is True: - dump_path = 'results.layers' + dump_path = 'results.paddle' dump_results(results, fetch_list_name, dump_path) - print('all results dumped to [%s]' % (dump_path)) + print('all result of layers dumped to [%s]' % (dump_path)) else: result = results[0] - print('predicted class:', np.argmax(result)) + print('succeed infer with results[class:%d]' % (np.argmax(result))) + + return 0 + + +def caffe_infer(prototxt, caffemodel, datafile): + """ do inference using pycaffe for debug, + all intermediate results will be dumpped to 
'results.caffe' + """ + import caffe + + net = caffe.Net(prototxt, caffemodel, caffe.TEST) + input_layer = net.blobs.keys()[0] + print('got name of input layer is:%s' % (input_layer)) + input_shape = list(net.blobs[input_layer].data.shape[1:]) + + if '.npy' in datafile: + np_images = np.load(datafile) + else: + np_images = load_data(datafile, input_shape) + + inputs = {input_layer: np_images} + net.forward_all(**inputs) + + results = [] + names = [] + for k, v in net.blobs.items(): + k = k.rstrip('_output') + k = k.replace('/', '_') + names.append(k) + results.append(v.data.copy()) + + dump_path = 'results.caffe' + dump_results(results, names, dump_path) + print('all result of layers dumped to [%s]' % (dump_path)) + return 0 if __name__ == "__main__": @@ -122,21 +229,50 @@ if __name__ == "__main__": """ net_file = 'models/resnet50/resnet50.py' weight_file = 'models/resnet50/resnet50.npy' - imgfile = 'data/65.jpeg' + datafile = 'data/65.jpeg' net_name = 'ResNet50' + model_file = 'models/resnet50/fluid' + + ret = None + if len(sys.argv) <= 2: + pass + elif sys.argv[1] == 'caffe': + if len(sys.argv) != 5: + print('usage:') + print('\tpython %s caffe [prototxt] [caffemodel] [datafile]' % + (sys.argv[0])) + sys.exit(1) + prototxt = sys.argv[2] + caffemodel = sys.argv[3] + datafile = sys.argv[4] + ret = caffe_infer(prototxt, caffemodel, datafile) + elif sys.argv[1] == 'infer': + if len(sys.argv) != 4: + print('usage:') + print('\tpython %s infer [fluid_model] [datafile]' % (sys.argv[0])) + sys.exit(1) + model_path = sys.argv[2] + datafile = sys.argv[3] + ret = infer(model_path, datafile) + elif sys.argv[1] == 'dump': + if len(sys.argv) != 6: + print('usage:') + print('\tpython %s dump [net_file] [weight_file] [datafile] [net_name]' \ + % (sys.argv[0])) + print('\teg:python dump %s %s %s %s %s' % (sys.argv[0],\ + net_file, weight_file, datafile, net_name)) + sys.exit(1) + + net_file = sys.argv[2] + weight_file = sys.argv[3] + datafile = sys.argv[4] + net_name = sys.argv[5] + ret = infer(weight_file, datafile, net_file, net_name) - argc = len(sys.argv) - if argc == 5: - net_file = sys.argv[1] - weight_file = sys.argv[2] - imgfile = sys.argv[3] - net_name = sys.argv[4] - elif argc > 1: + if ret is None: print('usage:') - print('\tpython %s [net_file] [weight_file] [imgfile] [net_name]' % - (sys.argv[0])) - print('\teg:python %s %s %s %s %s' % (sys.argv[0], net_file, - weight_file, imgfile, net_name)) + print(' python %s [infer] [fluid_model] [imgfile]' % (sys.argv[0])) + print(' eg:python %s infer %s %s' % (sys.argv[0], model_file, datafile)) sys.exit(1) - infer(net_file, net_name, weight_file, imgfile) + sys.exit(ret) diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh old mode 100644 new mode 100755 index 7a1a5ebd7c0a5090c00a0c8ca6b0e11b110967dc..0fdd56e4519bf726a8e5bc95559d1d9b47f14774 --- a/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh @@ -3,7 +3,7 @@ #function: # a tool used to: # 1, convert a caffe model -# 2, do inference using this model +# 2, do inference(only in fluid) using this model # #usage: # bash run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50 @@ -65,8 +65,13 @@ if [[ -z $only_convert ]];then PYTHON=`which python` fi imgfile="data/65.jpeg" - net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/\"([^\"]+)\"/){ print $1."\n";}'` - $PYTHON ./infer.py $net_file $weight_file $imgfile $net_name 
+ #FIX ME: + # only look the first line in prototxt file for the name of this network, maybe not correct + net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/^name\s*:\s*\"([^\"]+)\"/){ print $1."\n";}'` + if [[ -z $net_name ]];then + net_name="MyNet" + fi + $PYTHON ./infer.py dump $net_file $weight_file $imgfile $net_name ret=$? fi exit $ret diff --git a/fluid/image_classification/caffe2fluid/examples/mnist/evaluate.py b/fluid/image_classification/caffe2fluid/examples/mnist/evaluate.py index 5c86635d5a014262bdec40fe063915350c5fadb3..946fa943726b39c4e8e8dfce9f41c87a06ee1912 100644 --- a/fluid/image_classification/caffe2fluid/examples/mnist/evaluate.py +++ b/fluid/image_classification/caffe2fluid/examples/mnist/evaluate.py @@ -7,8 +7,8 @@ import sys import os import numpy as np +import paddle.fluid as fluid import paddle.v2 as paddle -import paddle.v2.fluid as fluid def test_model(exe, test_program, fetch_list, test_reader, feeder): @@ -34,9 +34,6 @@ def evaluate(net_file, model_file): from lenet import LeNet as MyNet - with_gpu = False - paddle.init(use_gpu=with_gpu) - #1, define network topology images = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') @@ -45,7 +42,7 @@ def evaluate(net_file, model_file): prediction = net.layers['prob'] acc = fluid.layers.accuracy(input=prediction, label=label) - place = fluid.CUDAPlace(0) if with_gpu is True else fluid.CPUPlace() + place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) diff --git a/fluid/image_classification/caffe2fluid/examples/mnist/run.sh b/fluid/image_classification/caffe2fluid/examples/mnist/run.sh old mode 100644 new mode 100755 diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/__init__.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e2276c09c2c408f4c6e65264b4bde91429df53ca --- /dev/null +++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/__init__.py @@ -0,0 +1,105 @@ +""" +""" + +from .register import get_registered_layers +#custom layer import begins + +import axpy +import flatten +import argmax + +#custom layer import ends + +custom_layers = get_registered_layers() + + +def set_args(f, params): + """ set args for function 'f' using the parameters in node.layer.parameters + + Args: + f (function): a python function object + params (object): a object contains attributes needed by f's arguments + + Returns: + arg_names (list): a list of argument names + kwargs (dict): a dict contains needed arguments + """ + argc = f.__code__.co_argcount + arg_list = f.__code__.co_varnames[0:argc] + + kwargs = {} + for arg_name in arg_list: + try: + v = getattr(params, arg_name, None) + except Exception as e: + #maybe failed to extract caffe's parameters + v = None + + if v is not None: + kwargs[arg_name] = v + + return arg_list, kwargs + + +def has_layer(kind): + """ test whether this layer exists in custom layer + """ + return kind in custom_layers + + +def compute_output_shape(kind, node): + assert kind in custom_layers, "layer[%s] not exist in custom layers" % ( + kind) + shape_func = custom_layers[kind]['shape'] + + parents = node.parents + inputs = [list(p.output_shape) for p in parents] + arg_names, kwargs = set_args(shape_func, node.layer.parameters) + + if len(inputs) == 1: + inputs = inputs[0] + + return shape_func(inputs, **kwargs) + + +def make_node(template, kind, node): + 
""" make a TensorFlowNode for custom layer which means construct + a piece of code to define a layer implemented in 'custom_layers' + + Args: + @template (TensorFlowNode): a factory to new a instance of TensorFLowNode + @kind (str): type of custom layer + @node (graph.Node): a layer in the net + + Returns: + instance of TensorFlowNode + """ + assert kind in custom_layers, "layer[%s] not exist in custom layers" % ( + kind) + + layer_func = custom_layers[kind]['layer'] + + #construct arguments needed by custom layer function from node's parameters + arg_names, kwargs = set_args(layer_func, node.layer.parameters) + + return template('custom_layer', kind, **kwargs) + + +def make_custom_layer(kind, inputs, name, *args, **kwargs): + """ execute a custom layer which is implemented by users + + Args: + @kind (str): type name of this layer + @inputs (vars): variable list created by fluid + @namme (str): name for this layer + @args (tuple): other positional arguments + @kwargs (dict): other kv arguments + + Returns: + output (var): output variable for this layer + """ + assert kind in custom_layers, "layer[%s] not exist in custom layers" % ( + kind) + + layer_func = custom_layers[kind]['layer'] + return layer_func(inputs, name, *args, **kwargs) diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/argmax.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/argmax.py new file mode 100644 index 0000000000000000000000000000000000000000..0d884f53a1027d091fe409632209a2d9a579f573 --- /dev/null +++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/argmax.py @@ -0,0 +1,71 @@ +""" a custom layer for 'argmax', maybe we should implement this in standard way. + more info can be found here: http://caffe.berkeleyvision.org/tutorial/layers/argmax.html +""" +from .register import register + + +def import_fluid(): + import paddle.fluid as fluid + return fluid + + +def argmax_shape(input_shape, out_max_val=False, top_k=1, axis=-1): + """ calculate the output shape of this layer using input shape + + Args: + @input_shape (list of num): a list of number which represents the input shape + @out_max_val (bool): parameter from caffe's ArgMax layer + @top_k (int): parameter from caffe's ArgMax layer + @axis (int): parameter from caffe's ArgMax layer + + Returns: + @output_shape (list of num): a list of numbers represent the output shape + """ + input_shape = list(input_shape) + + if axis < 0: + axis += len(input_shape) + + assert (axis + 1 == len(input_shape) + ), 'only can be applied on the last dimension[axis:%d, %s] now,'\ + 'make sure you have set axis param in xxx.prototxt file' \ + % (axis, str(input_shape)) + + output_shape = input_shape + output_shape[-1] = top_k + if out_max_val is True: + output_shape[-1] *= 2 + + return output_shape + + +def argmax_layer(input, name, out_max_val=False, top_k=1, axis=-1): + """ build a layer of type 'ArgMax' using fluid + + Args: + @input (variable): input fluid variable for this layer + @name (str): name for this layer + @out_max_val (bool): parameter from caffe's ArgMax layer + @top_k (int): parameter from caffe's ArgMax layer + @axis (int): parameter from caffe's ArgMax layer + + Returns: + output (variable): output variable for this layer + """ + + fluid = import_fluid() + + if axis < 0: + axis += len(input.shape) + + topk_var, index_var = fluid.layers.topk(input=input, k=top_k) + if out_max_val is True: + index_var = fluid.layers.cast(index_var, dtype=topk_var.dtype) + output = fluid.layers.concat([index_var, topk_var], axis=axis) + 
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/axpy.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/axpy.py
new file mode 100644
index 0000000000000000000000000000000000000000..389bb7996e87b2813a7704ef5e0c14332f95ab08
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/axpy.py
@@ -0,0 +1,51 @@
+""" A custom layer for 'axpy' which receives 3 tensors and outputs 1 tensor.
+    the function performed is (the multiplication and addition are elementwise):
+        output = inputs[0] * inputs[1] + inputs[2]
+"""
+
+from .register import register
+
+
+def axpy_shape(input_shapes):
+    """ calculate the output shape of this layer using the input shapes
+
+    Args:
+        @input_shapes (list of tuples): a list of input shapes
+
+    Returns:
+        @output_shape (list of num): a list of numbers which represents the output shape
+    """
+    assert len(input_shapes) == 3, "not a valid input shape for the axpy layer"
+    assert len(input_shapes[0]) == len(input_shapes[1]), 'inputs should have the same dims'
+
+    output_shape = input_shapes[1]
+    assert (input_shapes[2] == output_shape),\
+            "shape not consistent for axpy[%s <--> %s]" \
+            % (str(output_shape), str(input_shapes[2]))
+
+    return output_shape
+
+
+def axpy_layer(inputs, name):
+    """ build a layer of type 'Axpy' using fluid
+
+    Args:
+        @inputs (list of variables): input fluid variables for this layer
+        @name (str): name for this layer
+
+    Returns:
+        output (variable): output variable for this layer
+    """
+    import paddle.fluid as fluid
+
+    assert len(inputs) == 3, "invalid inputs for axpy[%s]" % (name)
+    alpha = inputs[0]
+    x = inputs[1]
+    y = inputs[2]
+    output = fluid.layers.elementwise_mul(x, alpha, axis=0)
+    output = fluid.layers.elementwise_add(output, y)
+
+    return output
+
+
+register(kind='Axpy', shape=axpy_shape, layer=axpy_layer)
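+
+
+if __name__ == '__main__':
+    # a minimal self-check of the shape inference only; axpy_layer needs a
+    # fluid Program context, so it is not exercised here. Run from the package
+    # root, e.g. 'python -m kaffe.custom_layers.axpy'. The shapes are
+    # illustrative: a (4, 3, 1, 1) scale applied to (4, 3, 8, 8) feature maps.
+    shapes = [[4, 3, 1, 1], [4, 3, 8, 8], [4, 3, 8, 8]]
+    assert axpy_shape(shapes) == [4, 3, 8, 8]
+    print('axpy_shape: ok')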
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/flatten.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/flatten.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f7af4266f7fd4b7b6e8ee868f44f1b35f35cb00
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/flatten.py
@@ -0,0 +1,73 @@
+""" a custom layer for 'flatten'; maybe we should implement this in a standard way.
+    more info can be found here: http://caffe.berkeleyvision.org/tutorial/layers/flatten.html
+"""
+from .register import register
+
+
+def import_fluid():
+    import paddle.fluid as fluid
+    return fluid
+
+
+def flatten_shape(input_shape, axis=1, end_axis=-1):
+    """ calculate the output shape of this layer using the input shape
+
+    Args:
+        @input_shape (list of num): a list of numbers which represents the input shape
+        @axis (int): parameter from caffe's Flatten layer
+        @end_axis (int): parameter from caffe's Flatten layer
+
+    Returns:
+        @output_shape (list of num): a list of numbers which represents the output shape
+    """
+
+    start_axis = axis
+    end_axis = end_axis
+    input_shape = list(input_shape)
+    if start_axis < 0:
+        start_axis += len(input_shape)
+
+    if end_axis < 0:
+        end_axis += len(input_shape)
+
+    assert start_axis <= end_axis, 'invalid axis[%d] or end_axis[%d] params'\
+            % (start_axis, end_axis)
+    output_shape = input_shape[0:start_axis]
+    # note: caffe flattens the axes from start_axis to end_axis *inclusively*,
+    # so the slices below must cover end_axis as well
+    flat_sz = reduce(lambda a, b: a * b, input_shape[start_axis:end_axis + 1])
+    output_shape += [flat_sz]
+    output_shape += input_shape[end_axis + 1:]
+
+    return output_shape
+
+
+def flatten_layer(input, name, axis=1, end_axis=-1):
+    """ build a layer of type 'Flatten' using fluid
+
+    Args:
+        @input (variable): input fluid variable for this layer
+        @name (str): name for this layer
+        @axis (int): parameter from caffe's Flatten layer
+        @end_axis (int): parameter from caffe's Flatten layer
+
+    Returns:
+        output (variable): output variable for this layer
+    """
+    fluid = import_fluid()
+
+    input_shape = list(input.shape)
+    dims = len(input_shape)
+    start_axis = axis if axis >= 0 else axis + dims
+    end_axis = end_axis if end_axis >= 0 else end_axis + dims
+
+    assert start_axis <= end_axis, 'invalid axis or end_axis params'
+    output_shape = input_shape[0:start_axis]
+    # flatten the axes from start_axis to end_axis inclusively, as caffe does
+    flat_sz = reduce(lambda a, b: a * b, input_shape[start_axis:end_axis + 1])
+    output_shape += [flat_sz]
+    output_shape += input_shape[end_axis + 1:]
+
+    output = fluid.layers.reshape(input, shape=output_shape, name=name)
+
+    return output
+
+
+register(kind='Flatten', shape=flatten_shape, layer=flatten_layer)
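+
+
+if __name__ == '__main__':
+    # a minimal self-check of the shape inference only (run from the package
+    # root, e.g. 'python -m kaffe.custom_layers.flatten'); with the default
+    # parameters every axis after the batch axis is flattened, while an
+    # explicit end_axis only collapses the axes up to and including it
+    assert flatten_shape([4, 3, 8, 8]) == [4, 192]
+    assert flatten_shape([4, 3, 8, 8], axis=1, end_axis=2) == [4, 24, 8]
+    print('flatten_shape: ok')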
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/register.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/register.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae806cd469cb763dd06bbe406abb2ced3419cffc
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/register.py
@@ -0,0 +1,37 @@
+""" this module provides 'register' for registering customized layers
+"""
+
+g_custom_layers = {}
+
+
+def register(kind, shape, layer):
+    """ register a custom layer or a list of custom layers
+
+    Args:
+        @kind (str or list): type name(s) of the layer
+        @shape (function): a function to infer the shape of the layer's output
+        @layer (function): a function to build the layer using fluid
+
+    Returns:
+        None
+    """
+    assert type(shape).__name__ == 'function', 'shape should be a function'
+    assert type(layer).__name__ == 'function', 'layer should be a function'
+
+    if type(kind) is str:
+        kind = [kind]
+    else:
+        assert type(
+            kind) is list, 'invalid param "kind" for register, not a list or str'
+
+    for k in kind:
+        assert type(
+            k) is str, 'invalid param "kind" for register, not a list of str'
+        assert k not in g_custom_layers, 'this type[%s] has already been registered' % (
+            k)
+        print('register layer[%s]' % (k))
+        g_custom_layers[k] = {'shape': shape, 'layer': layer}
+
+
+def get_registered_layers():
+    return g_custom_layers
diff --git a/fluid/image_classification/caffe2fluid/kaffe/graph.py b/fluid/image_classification/caffe2fluid/kaffe/graph.py
index 5387f441852b8a318a41898ee0b62b4903ccdabb..6182a5352dac4746c64ebef0b3a886399dbd3d57 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/graph.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/graph.py
@@ -3,7 +3,7 @@
 from google.protobuf import text_format
 
 from .caffe import get_caffe_resolver
 from .errors import KaffeError, print_stderr
 from .layers import LayerAdapter, LayerType, NodeKind, NodeDispatch
-from .shapes import TensorShape
+from .shapes import make_tensor
 
 
 class Node(object):
@@ -52,7 +52,10 @@
     def __init__(self, nodes=None, name=None):
         self.nodes = nodes or []
         self.node_lut = {node.name: node for node in self.nodes}
-        self.name = name
+        if name is None or name == '':
+            self.name = 'MyNet'
+        else:
+            self.name = name
 
     def add_node(self, node):
         self.nodes.append(node)
@@ -95,7 +98,7 @@
     def compute_output_shapes(self):
         sorted_nodes = self.topologically_sorted()
         for node in sorted_nodes:
-            node.output_shape = TensorShape(
+            node.output_shape = make_tensor(
                 *NodeKind.compute_output_shape(node))
 
     def replaced(self, new_nodes):
@@ -108,6 +111,7 @@
             if graph is None:
                 raise KaffeError('Transformer failed: {}'.format(transformer))
             assert isinstance(graph, Graph)
+
         return graph
 
     def __contains__(self, key):
@@ -120,10 +124,18 @@
         for node in self.topologically_sorted():
             # If the node has learned parameters, display the first one's shape.
             # In case of convolutions, this corresponds to the weights.
-            data_shape = node.data[0].shape if node.data else '--'
-            out_shape = node.output_shape or '--'
-            s.append('{:<20} {:<30} {:>20} {:>20}'.format(
-                node.kind, node.name, data_shape, tuple(out_shape)))
+            if node.data is None:
+                data_shape = '--'
+                out_shape = node.output_shape or '--'
+                s.append('{:<20} {:<30} {:>20} {:>20}'.format(
+                    node.kind, node.name, data_shape, tuple(out_shape)))
+            else:
+                for d in node.data:
+                    data_shape = d.shape
+                    out_shape = node.output_shape or '--'
+                    s.append('{:<20} {:<30} {:>20} {:>20}'.format(
+                        node.kind, node.name, data_shape, tuple(out_shape)))
         return '\n'.join(s)
@@ -234,6 +246,7 @@
         if (parent_node is None) or (parent_node == node):
             parent_node = graph.get_node(input_name)
         node.add_parent(parent_node)
+
         if len(layer.top) > 1:
             raise KaffeError('Multiple top nodes are not supported.')
diff --git a/fluid/image_classification/caffe2fluid/kaffe/layers.py b/fluid/image_classification/caffe2fluid/kaffe/layers.py
index f263407ab41458573f2df775f99202bed0e9d894..dcdd26040b6918d524f1d5ae58aa92f6da1a9550 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/layers.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/layers.py
@@ -2,6 +2,7 @@
 import re
 import numbers
 from collections import namedtuple
 
+import custom_layers
 from .shapes import *
 
 LAYER_DESCRIPTORS = {
@@ -116,6 +117,9 @@
 class NodeKind(LayerType):
     @staticmethod
     def map_raw_kind(kind):
+        if custom_layers.has_layer(kind):
+            return kind
+
         if kind in LAYER_TYPES:
             return kind
 
@@ -127,6 +131,9 @@
     @staticmethod
     def compute_output_shape(node):
+        if custom_layers.has_layer(node.kind):
+            return custom_layers.compute_output_shape(node.kind, node)
+
         try:
             val = LAYER_DESCRIPTORS[node.kind](node)
             return val
@@ -137,14 +144,13 @@
 class
NodeDispatchError(KaffeError): - pass class NodeDispatch(object): @staticmethod def get_handler_name(node_kind): - if len(node_kind) <= 4: + if len(node_kind) <= 6: # A catch-all for things like ReLU and tanh return node_kind.lower() # Convert from CamelCase to under_scored @@ -152,6 +158,9 @@ class NodeDispatch(object): return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower() def get_handler(self, node_kind, prefix): + if custom_layers.has_layer(node_kind): + return getattr(self, 'map_custom') + name = self.get_handler_name(node_kind) name = '_'.join((prefix, name)) try: @@ -174,8 +183,10 @@ class LayerAdapter(object): try: return getattr(self.layer, name) except AttributeError: + print(dir(self.layer)) raise NodeDispatchError( - 'Caffe parameters not found for layer kind: %s' % (self.kind)) + 'Caffe parameters not found attr[%s] for layer kind[%s]' % + (name, self.kind)) @staticmethod def get_kernel_value(scalar, repeated, idx, default=None): diff --git a/fluid/image_classification/caffe2fluid/kaffe/net_template.py b/fluid/image_classification/caffe2fluid/kaffe/net_template.py new file mode 100644 index 0000000000000000000000000000000000000000..e57caf97948a903b02a136a38b0a0b716ac49057 --- /dev/null +++ b/fluid/image_classification/caffe2fluid/kaffe/net_template.py @@ -0,0 +1,151 @@ +""" this module is used as a template for generating sub class of Network +""" + + +class MyNet(object): + ### automatically generated by caffe2fluid ### + inputs_info = "INPUTS_INFO" + custom_layers_path = "CAFFE2FLUID_CUSTOM_LAYERS" + + def custom_layer_factory(self): + import os + + pk_paths = [] + default = os.path.dirname(os.path.abspath(__file__)) + location = os.environ.get('CAFFE2FLUID_CUSTOM_LAYERS', default) + pk_name = 'custom_layers' + pk_dir = os.path.join(location, pk_name) + pk_paths.append((location, pk_dir)) + + location = MyNet.custom_layers_path + pk_dir = os.path.join(MyNet.custom_layers_path, pk_name) + pk_paths.append((location, pk_dir)) + + for loc, pk_dir in pk_paths: + if os.path.exists(pk_dir): + if loc not in sys.path: + sys.path.insert(0, loc) + break + + try: + from custom_layers import make_custom_layer + return make_custom_layer + except Exception as e: + print('maybe you should set $CAFFE2FLUID_CUSTOM_LAYERS first') + raise e + + @classmethod + def input_shapes(cls): + return cls.inputs_info + + @classmethod + def convert(cls, npy_model, fluid_path, outputs=None): + fluid = import_fluid() + shapes = cls.input_shapes() + input_name = shapes.keys()[0] + feed_data = {} + for name, shape in shapes.items(): + data_layer = fluid.layers.data( + name=name, shape=shape, dtype="float32") + feed_data[name] = data_layer + + net = cls(feed_data) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + net.load(data_path=npy_model, exe=exe, place=place) + output_vars = [] + if outputs is None: + output_vars.append(net.get_output()) + else: + if type(outputs) is list: + for n in outputs: + assert n in net.layers, 'not found layer with this name[%s]' % ( + n) + output_vars.append(net.layers[n]) + + fluid.io.save_inference_model( + fluid_path, [input_name], + output_vars, + exe, + main_program=None, + model_filename='model', + params_filename='params') + return 0 + + +def main(): + """ a tool used to convert caffe model to fluid + """ + + import sys + import os + filename = os.path.splitext(os.path.basename(sys.argv[0]))[0] + if len(sys.argv) < 3: + print('usage:') + print(' python %s %s.npy [save_dir] [layer names seperated by comma]' \ + % 
(sys.argv[0], filename)) + print(' eg: python %s %s.npy ./fluid' % (sys.argv[0], filename)) + print(' eg: python %s %s.npy ./fluid layer_name1,layer_name2' \ + % (sys.argv[0], filename)) + return 1 + + npy_weight = sys.argv[1] + fluid_model = sys.argv[2] + outputs = None + if len(sys.argv) >= 4: + outputs = sys.argv[3].split(',') + + ret = MyNet.convert(npy_weight, fluid_model, outputs) + if ret == 0: + outputs = 'last output layer' if outputs is None else outputs + print('succeed to convert to fluid format with output layers[%s]' + ' in directory[%s]' % (outputs, fluid_model)) + else: + print('failed to convert model to fluid format') + + return ret + + +def generate_net_code(net_name, inputs_info): + """ generate framework of a custom net code which represent a subclass of Network + + Args: + @net_name (str): class name for this net + @inputs_info (str): a str which represents a dict, eg: '{"data": [3, 32, 32]}' + Returns: + net_codes (str): codes for this subclass + """ + import os + import inspect + + net_codes = str(inspect.getsource(MyNet)) + net_codes = net_codes.replace('MyNet(object)', '%s(Network)' % net_name) + net_codes = net_codes.replace('"INPUTS_INFO"', inputs_info) + + custom_layer_dir = os.path.dirname(os.path.abspath(__file__)) + net_codes = net_codes.replace('CAFFE2FLUID_CUSTOM_LAYERS', custom_layer_dir) + return net_codes + + +def generate_main_code(net_name): + """ generate a piece of code for 'main' function + + Args: + @net_name (str): class name for this net + + Returns: + main_codes (str): codes for this main function + """ + import inspect + + main_codes = str(inspect.getsource(main)) + main_codes = main_codes.replace('MyNet', net_name) + return main_codes + + +if __name__ == "__main__": + """ just for testing + """ + print generate_net_code('Attribute', "{'data': [3, 277, 277]}") + print generate_main_code('Attribute') diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py index fd6a71cb6acbfffe2aed1d3680fb91c8c85dc3d3..258830bdac00af8fb9f2e83207730b404a04f7d5 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py +++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py @@ -1,10 +1,11 @@ -import math +import sys import os +import math import numpy as np def import_fluid(): - import paddle.v2.fluid as fluid + import paddle.fluid as fluid return fluid @@ -64,7 +65,7 @@ class Network(object): if os.path.isdir(data_path): assert (exe is not None), \ 'must provide a executor to load fluid model' - fluid.io.load_persistables_if_exist(executor=exe, dirname=data_path) + fluid.io.load_persistables(executor=exe, dirname=data_path) return True #load model from a npy file @@ -161,57 +162,53 @@ class Network(object): output = fluid.layers.relu(x=input) return output - def _adjust_pad_if_needed(self, i_hw, k_hw, s_hw, p_hw): - #adjust the padding if needed - i_h, i_w = i_hw - k_h, k_w = k_hw - s_h, s_w = s_hw - p_h, p_w = p_hw - - def is_consistent(i, k, s, p): - o = i + 2 * p - k - if o % s == 0: - return True - else: - return False - - real_p_h = 0 - real_p_w = 0 - if is_consistent(i_h, k_h, s_h, p_h) is False: - real_p_h = int(k_h / 2) - - if is_consistent(i_w, k_w, s_w, p_w) is False: - real_p_w = int(k_w / 2) - - return [real_p_h, real_p_w] - - def pool(self, pool_type, input, k_h, k_w, s_h, s_w, name, padding): + def pool(self, pool_type, input, k_h, k_w, s_h, s_w, ceil_mode, padding, + name): # Get the number of channels in the input in_hw = 
input.shape[2:] k_hw = [k_h, k_w] s_hw = [s_h, s_w] - if padding is None: - #fix bug about the difference between conv and pool - #more info: https://github.com/BVLC/caffe/issues/1318 - padding = self._adjust_pad_if_needed(in_hw, k_hw, s_hw, [0, 0]) - fluid = import_fluid() output = fluid.layers.pool2d( input=input, pool_size=k_hw, pool_stride=s_hw, pool_padding=padding, + ceil_mode=ceil_mode, pool_type=pool_type) return output @layer - def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None): - return self.pool('max', input, k_h, k_w, s_h, s_w, name, padding) + def max_pool(self, + input, + k_h, + k_w, + s_h, + s_w, + ceil_mode, + padding=[0, 0], + name=None): + return self.pool('max', input, k_h, k_w, s_h, s_w, ceil_mode, padding, + name) + + @layer + def avg_pool(self, + input, + k_h, + k_w, + s_h, + s_w, + ceil_mode, + padding=[0, 0], + name=None): + return self.pool('avg', input, k_h, k_w, s_h, s_w, ceil_mode, padding, + name) @layer - def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None): - return self.pool('avg', input, k_h, k_w, s_h, s_w, name, padding) + def sigmoid(self, input, name): + fluid = import_fluid() + return fluid.layers.sigmoid(input) @layer def lrn(self, input, radius, alpha, beta, name, bias=1.0): @@ -258,7 +255,12 @@ class Network(object): return output @layer - def batch_normalization(self, input, name, scale_offset=True, relu=False): + def batch_normalization(self, + input, + name, + scale_offset=True, + eps=1e-5, + relu=False): # NOTE: Currently, only inference is supported fluid = import_fluid() prefix = name + '_' @@ -276,7 +278,7 @@ class Network(object): bias_attr=bias_attr, moving_mean_name=mean_name, moving_variance_name=variance_name, - epsilon=1e-5, + epsilon=eps, act='relu' if relu is True else None) return output @@ -287,3 +289,16 @@ class Network(object): output = fluid.layers.dropout( input, dropout_prob=drop_prob, is_test=is_test, name=name) return output + + def custom_layer_factory(self): + """ get a custom layer maker provided by subclass + """ + raise NotImplementedError( + '[custom_layer_factory] must be implemented by the subclass.') + + @layer + def custom_layer(self, inputs, kind, name, *args, **kwargs): + """ make custom layer + """ + layer_factory = self.custom_layer_factory() + return layer_factory(kind, inputs, name, *args, **kwargs) diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py index 4d7ec49a39199bb1415f830d88f89e93a4b95266..6aa3b38531f946e4656e05c52c69087f3b89aaf4 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py +++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py @@ -109,9 +109,17 @@ class TensorFlowMapper(NodeMapper): # Stochastic pooling, for instance. raise KaffeError('Unsupported pooling type.') (kernel_params, padding) = self.get_kernel_params(node) + ceil_mode = getattr(node.layer.parameters, 'ceil_mode', True) return TensorFlowNode(pool_op, kernel_params.kernel_h, kernel_params.kernel_w, kernel_params.stride_h, - kernel_params.stride_w, **padding) + kernel_params.stride_w, ceil_mode, **padding) + + def map_sigmoid(self, node): + return TensorFlowNode('sigmoid') + + def map_custom(self, node): + from .. 
import custom_layers + return custom_layers.make_node(TensorFlowNode, node.kind, node) def map_inner_product(self, node): #TODO: Axis @@ -142,7 +150,13 @@ class TensorFlowMapper(NodeMapper): def map_batch_norm(self, node): scale_offset = len(node.data) == 4 - kwargs = {} if scale_offset else {'scale_offset': False} + + #this default value comes from caffe's param in batch_norm + default_eps = 1e-5 + kwargs = {'scale_offset': scale_offset} + if node.parameters.eps != default_eps: + kwargs['eps'] = node.parameters.eps + return MaybeActivated( node, default=False)('batch_normalization', **kwargs) @@ -184,18 +198,10 @@ class TensorFlowEmitter(object): codes.append(network_source + '\n') return self.statement('\n'.join(codes)) - def emit_class_def(self, name): - return self.statement('class %s(Network):' % (name)) - def emit_setup_def(self): return self.statement('def setup(self):') - def emit_shape_def(self, input_nodes): - self.outdent() - func_def = self.statement('@classmethod') - func_def += self.statement('def input_shapes(cls):') - self.indent() - + def get_inputs_info(self, input_nodes): input_shapes = {} for n in input_nodes: name = n.name @@ -204,42 +210,7 @@ class TensorFlowEmitter(object): input_shapes[name] = ', '.join(shape) input_shapes = ['"%s": [%s]' % (n, l) for n, l in input_shapes.items()] shape_str = ','.join(input_shapes) - func_def += self.statement('return {%s}' % (shape_str)) - return '\n\n' + func_def - - def emit_convert_def(self, input_nodes): - codes = [] - inputs = {} - codes.append('shapes = cls.input_shapes()') - for n in input_nodes: - name = n.name - layer_var = name + '_layer' - layer_def = '%s = fluid.layers.data(name="%s", shape=shapes["%s"],'\ - ' dtype="float32")' % (layer_var, name, name) - #layer_var, layer_def = data_layer_def(n.name, n.output_shape) - codes.append(layer_def) - inputs[name] = layer_var - - input_dict = ','.join(['"%s": %s' % (n, l) for n, l in inputs.items()]) - - codes.append('feed_data = {' + input_dict + '}') - codes.append('net = cls(feed_data)') - - codes.append("place = fluid.CPUPlace()") - codes.append("exe = fluid.Executor(place)") - codes.append("exe.run(fluid.default_startup_program())") - codes.append("net.load(data_path=npy_model, exe=exe, place=place)") - codes.append( - "fluid.io.save_persistables(executor=exe, dirname=fluid_path)") - - self.outdent() - func_def = self.statement('@classmethod') - func_def += self.statement('def convert(cls, npy_model, fluid_path):') - self.indent() - func_def += self.statement('import paddle.v2.fluid as fluid') - for l in codes: - func_def += self.statement(l) - return '\n' + func_def + return '{%s}' % (shape_str) def emit_main_def(self, name): if name is None: @@ -248,13 +219,7 @@ class TensorFlowEmitter(object): self.prefix = '' main_def = self.statement('if __name__ == "__main__":') self.indent() - main_def += self.statement("#usage: python xxxnet.py xxx.npy ./model\n") - main_def += self.statement("import sys") - main_def += self.statement("npy_weight = sys.argv[1]") - main_def += self.statement("fluid_model = sys.argv[2]") - main_def += self.statement("%s.convert(npy_weight, fluid_model)" % - (name)) - main_def += self.statement("exit(0)") + main_def += self.statement('exit(main())') return '\n\n' + main_def def emit_parents(self, chain): @@ -269,10 +234,17 @@ class TensorFlowEmitter(object): return self.statement('self.' 
+ node.emit()) def emit(self, name, chains, input_nodes=None): + from ..net_template import generate_net_code + from ..net_template import generate_main_code + self.net_name = name + inputs_info = self.get_inputs_info(input_nodes) + s = self.emit_imports() - s += self.emit_class_def(name) + s += generate_net_code(name, inputs_info) + '\n' self.indent() + + # define the net using api s += self.emit_setup_def() self.indent() blocks = [] @@ -283,8 +255,9 @@ class TensorFlowEmitter(object): b += self.emit_node(node) blocks.append(b[:-1]) s = s + '\n\n'.join(blocks) - s += self.emit_shape_def(input_nodes) - s += self.emit_convert_def(input_nodes) + + # define the main function + s += '\n\n\n' + generate_main_code(name) s += self.emit_main_def(name) return s @@ -323,6 +296,7 @@ class Transformer(object): # (Caffe's GoogLeNet implementation uses slashes) NodeRenamer(lambda node: node.name.replace('/', '_')) ] + self.graph = graph.transformed(transformers) # Display the graph @@ -334,9 +308,6 @@ class Transformer(object): transformers = [ # Reshape the parameters to TensorFlow's ordering DataReshaper({ - # (c_o, c_i, h, w) -> (h, w, c_i, c_o) for TF - NodeKind.Convolution: (0, 1, 2, 3), - # (c_o, c_i) -> (c_i, c_o) NodeKind.InnerProduct: (1, 0) }), diff --git a/fluid/image_classification/caffe2fluid/kaffe/shapes.py b/fluid/image_classification/caffe2fluid/kaffe/shapes.py index e8124730c66eaecb85f7aff58e08f6dc16668343..a2ce26362bb9afd659f8db7d678afeabd3efa6b5 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/shapes.py +++ b/fluid/image_classification/caffe2fluid/kaffe/shapes.py @@ -3,8 +3,24 @@ from collections import namedtuple from .errors import KaffeError -TensorShape = namedtuple('TensorShape', - ['batch_size', 'channels', 'height', 'width']) +Tensor4DShape = namedtuple('Tensor4DShape', + ['batch_size', 'channels', 'height', 'width']) + +Tensor2DShape = namedtuple('Tensor2DShape', ['batch_size', 'data']) + +ScalarShape = namedtuple('ScalarShape', ['batch_size']) + + +def make_tensor(batch_size, d1=None, d2=None, d3=None): + if d3 is not None: + return Tensor4DShape(batch_size, d1, d2, d3) + elif d1 is not None and d2 is None: + return Tensor2DShape(batch_size, d1) + elif d1 is None and d2 is None and d3 is None: + return ScalarShape(batch_size) + else: + raise NotImplementedError('invalid params for make_tensor %s' \ + % (str((batch_size, d1, d2, d3)))) def get_filter_output_shape(i_h, i_w, params, round_func): @@ -23,7 +39,7 @@ def get_strided_kernel_output_shape(node, round_func): params = node.layer.parameters has_c_o = hasattr(params, 'num_output') c = params.num_output if has_c_o else input_shape.channels - return TensorShape(input_shape.batch_size, c, o_h, o_w) + return make_tensor(input_shape.batch_size, c, o_h, o_w) def shape_not_implemented(node): @@ -36,7 +52,7 @@ def shape_identity(node): def shape_scalar(node): - return TensorShape(1, 1, 1, 1) + return make_tensor(1, 1, 1, 1) def shape_data(node): @@ -59,7 +75,7 @@ def shape_data(node): def shape_mem_data(node): params = node.parameters - return TensorShape(params.batch_size, params.channels, params.height, + return make_tensor(params.batch_size, params.channels, params.height, params.width) @@ -79,10 +95,15 @@ def shape_convolution(node): def shape_pool(node): - return get_strided_kernel_output_shape(node, math.ceil) + ceil_mode = getattr(node.layer.parameters, 'ceil_mode', True) + if ceil_mode is True: + method = math.ceil + else: + method = math.floor + + return get_strided_kernel_output_shape(node, method) def 
shape_inner_product(node): input_shape = node.get_only_parent().output_shape - return TensorShape(input_shape.batch_size, node.layer.parameters.num_output, - 1, 1) + return make_tensor(input_shape.batch_size, node.layer.parameters.num_output) diff --git a/fluid/image_classification/caffe2fluid/kaffe/transformers.py b/fluid/image_classification/caffe2fluid/kaffe/transformers.py index 9d300ca9c90672c3f3a3dbf7a14e48db6bb48f70..6d98703da3313cf466eb43c2adc49c0e0640a8de 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/transformers.py +++ b/fluid/image_classification/caffe2fluid/kaffe/transformers.py @@ -66,12 +66,14 @@ class DataInjector(object): def adjust_parameters(self, node, data): if not self.did_use_pb: return data + # When using the protobuf-backend, each parameter initially has four dimensions. # In certain cases (like FC layers), we want to eliminate the singleton dimensions. # This implementation takes care of the common cases. However, it does leave the # potential for future issues. # The Caffe-backend does not suffer from this problem. data = list(data) + squeeze_indices = [1] # Squeeze biases. if node.kind == NodeKind.InnerProduct: squeeze_indices.append(0) # Squeeze FC. @@ -80,8 +82,22 @@ class DataInjector(object): if idx >= len(data): continue - shape_old = data[idx].shape - data[idx] = np.squeeze(data[idx]) + d = data[idx] + assert len( + d.shape + ) == 4, 'invalid shape[%s] from caffe when adjust_parameters' % ( + str(d.shape)) + + shape_old = d.shape + sq_axis = None + if idx == 0: + sq_axis = (0, 1) + elif idx == 1: + sq_axis = (0, 1, 2) + else: + continue + + data[idx] = np.squeeze(d, axis=sq_axis) shape_new = data[idx].shape if len(shape_old) != shape_new: debug('squeeze idx:%d, with kind:%s,name:%s' % \ @@ -113,7 +129,10 @@ class DataReshaper(object): try: parent = node.get_only_parent() s = parent.output_shape - return s.height > 1 or s.width > 1 + if len(s) == 4: + return s.height > 1 or s.width > 1 + else: + return False except KaffeError: return False @@ -121,25 +140,26 @@ class DataReshaper(object): try: return self.mapping[node_kind] except KeyError: - raise - #raise KaffeError('Ordering not found for node kind: {}'.format(node_kind)) + raise KaffeError('Ordering not found for node kind: {}'.format( + node_kind)) def __call__(self, graph): for node in graph.nodes: if node.data is None: continue + if node.kind not in self.reshaped_node_types: # Check for 2+ dimensional data if any(len(tensor.shape) > 1 for tensor in node.data): notice('parmaters not reshaped for node: {}'.format(node)) continue + transpose_order = self.map(node.kind) weights = node.data[0] - if (node.kind == NodeKind.InnerProduct - ) and self.has_spatial_parent(node): + if node.kind == NodeKind.InnerProduct: # The FC layer connected to the spatial layer needs to be # re-wired to match the new spatial ordering. - in_shape = node.get_only_parent().output_shape + #in_shape = node.get_only_parent().output_shape fc_shape = weights.shape output_channels = fc_shape[0] weights = weights.reshape((output_channels, -1)) @@ -178,7 +198,8 @@ class SubNodeFuser(object): continue # Rewrite the fused node's children to its parent. for child in node.children: - child.parents.remove(node) + pos = child.parents.index(node) + child.parents[pos] = parent parent.add_child(child) # Disconnect the fused node from the graph. 
parent.children.remove(node) diff --git a/fluid/image_classification/caffe2fluid/proto/compile.sh b/fluid/image_classification/caffe2fluid/proto/compile.sh old mode 100644 new mode 100755 diff --git a/fluid/image_classification/se_resnext.py b/fluid/image_classification/se_resnext.py index c2b2d680fc995b1ea6cc5a2f640746a8a79ac029..573c6bec5bdc3c08e9503e46f6e09fad2cb09707 100644 --- a/fluid/image_classification/se_resnext.py +++ b/fluid/image_classification/se_resnext.py @@ -1,7 +1,5 @@ -import os import paddle.v2 as paddle import paddle.fluid as fluid -import reader def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, @@ -65,20 +63,44 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): return fluid.layers.elementwise_add(x=short, y=scale, act='relu') -def SE_ResNeXt(input, class_dim, infer=False): - cardinality = 64 - reduction_ratio = 16 - depth = [3, 8, 36, 3] - num_filters = [128, 256, 512, 1024] - - conv = conv_bn_layer( - input=input, num_filters=64, filter_size=3, stride=2, act='relu') - conv = conv_bn_layer( - input=conv, num_filters=64, filter_size=3, stride=1, act='relu') - conv = conv_bn_layer( - input=conv, num_filters=128, filter_size=3, stride=1, act='relu') - conv = fluid.layers.pool2d( - input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') +def SE_ResNeXt(input, class_dim, infer=False, layers=50): + supported_layers = [50, 152] + if layers not in supported_layers: + print("supported layers are", supported_layers, "but input layer is", + layers) + exit() + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + + conv = conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + + conv = conv_bn_layer( + input=input, num_filters=64, filter_size=3, stride=2, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=128, filter_size=3, stride=1, act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') for block in range(len(depth)): for i in range(depth[block]): @@ -97,93 +119,3 @@ def SE_ResNeXt(input, class_dim, infer=False): drop = pool out = fluid.layers.fc(input=drop, size=class_dim, act='softmax') return out - - -def train(learning_rate, - batch_size, - num_passes, - init_model=None, - model_save_dir='model', - parallel=True): - class_dim = 1000 - image_shape = [3, 224, 224] - - image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - if parallel: - places = fluid.layers.get_places() - pd = fluid.layers.ParallelDo(places) - - with pd.do(): - image_ = pd.read_input(image) - label_ = pd.read_input(label) - out = SE_ResNeXt(input=image_, class_dim=class_dim) - cost = fluid.layers.cross_entropy(input=out, label=label_) - avg_cost = fluid.layers.mean(x=cost) - accuracy = fluid.layers.accuracy(input=out, label=label_) - pd.write_output(avg_cost) - pd.write_output(accuracy) - - avg_cost, accuracy = pd() - avg_cost = fluid.layers.mean(x=avg_cost) - accuracy = fluid.layers.mean(x=accuracy) - else: - out = SE_ResNeXt(input=image, 
class_dim=class_dim) - cost = fluid.layers.cross_entropy(input=out, label=label) - avg_cost = fluid.layers.mean(x=cost) - accuracy = fluid.layers.accuracy(input=out, label=label) - - optimizer = fluid.optimizer.Momentum( - learning_rate=learning_rate, - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - opts = optimizer.minimize(avg_cost) - - inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program([avg_cost, accuracy]) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - if init_model is not None: - fluid.io.load_persistables(exe, init_model) - - train_reader = paddle.batch(reader.train(), batch_size=batch_size) - test_reader = paddle.batch(reader.test(), batch_size=batch_size) - feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) - - for pass_id in range(num_passes): - for batch_id, data in enumerate(train_reader()): - loss = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - print("Pass {0}, batch {1}, loss {2}".format(pass_id, batch_id, - float(loss[0]))) - - total_loss = 0.0 - total_acc = 0.0 - total_batch = 0 - for data in test_reader(): - loss, acc = exe.run(inference_program, - feed=feeder.feed(data), - fetch_list=[avg_cost, accuracy]) - total_loss += float(loss) - total_acc += float(acc) - total_batch += 1 - print("End pass {0}, test_loss {1}, test_acc {2}".format( - pass_id, total_loss / total_batch, total_acc / total_batch)) - - model_path = os.path.join(model_save_dir, str(pass_id)) - fluid.io.save_inference_model(model_path, ['image'], [out], exe) - - -if __name__ == '__main__': - train( - learning_rate=0.1, - batch_size=8, - num_passes=100, - init_model=None, - parallel=False) diff --git a/fluid/image_classification/train.py b/fluid/image_classification/train.py new file mode 100644 index 0000000000000000000000000000000000000000..6244e520900b58914b847c4acb451beb252efd30 --- /dev/null +++ b/fluid/image_classification/train.py @@ -0,0 +1,307 @@ +import os +import numpy as np +import time +import sys +import paddle.v2 as paddle +import paddle.fluid as fluid +from se_resnext import SE_ResNeXt +import reader + +import argparse +import functools +from utility import add_arguments, print_arguments + +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('batch_size', int, 256, "Minibatch size.") +add_arg('num_layers', int, 50, "How many layers for SE-ResNeXt model.") +add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.") +add_arg('parallel_exe', bool, True, "Whether to use ParallelExecutor to train or not.") +# yapf: enable + + +def train_parallel_do(args, + learning_rate, + batch_size, + num_passes, + init_model=None, + model_save_dir='model', + parallel=True, + use_nccl=True, + lr_strategy=None, + layers=50): + class_dim = 1000 + image_shape = [3, 224, 224] + + image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + if parallel: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) + + with pd.do(): + image_ = pd.read_input(image) + label_ = pd.read_input(label) + out = SE_ResNeXt(input=image_, class_dim=class_dim, layers=layers) + cost = fluid.layers.cross_entropy(input=out, label=label_) + avg_cost = 
fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5) + pd.write_output(avg_cost) + pd.write_output(acc_top1) + pd.write_output(acc_top5) + + avg_cost, acc_top1, acc_top5 = pd() + avg_cost = fluid.layers.mean(x=avg_cost) + acc_top1 = fluid.layers.mean(x=acc_top1) + acc_top5 = fluid.layers.mean(x=acc_top5) + else: + out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers) + cost = fluid.layers.cross_entropy(input=out, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + inference_program = fluid.default_main_program().clone(for_test=True) + + if lr_strategy is None: + optimizer = fluid.optimizer.Momentum( + learning_rate=learning_rate, + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + else: + bd = lr_strategy["bd"] + lr = lr_strategy["lr"] + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + + opts = optimizer.minimize(avg_cost) + if args.with_mem_opt: + fluid.memory_optimize(fluid.default_main_program()) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if init_model is not None: + fluid.io.load_persistables(exe, init_model) + + train_reader = paddle.batch(reader.train(), batch_size=batch_size) + test_reader = paddle.batch(reader.test(), batch_size=batch_size) + feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) + + for pass_id in range(num_passes): + train_info = [[], [], []] + test_info = [[], [], []] + for batch_id, data in enumerate(train_reader()): + t1 = time.time() + loss, acc1, acc5 = exe.run( + fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost, acc_top1, acc_top5]) + t2 = time.time() + period = t2 - t1 + train_info[0].append(loss[0]) + train_info[1].append(acc1[0]) + train_info[2].append(acc5[0]) + if batch_id % 10 == 0: + print("Pass {0}, trainbatch {1}, loss {2}, \ + acc1 {3}, acc5 {4} time {5}" + .format(pass_id, \ + batch_id, loss[0], acc1[0], acc5[0], \ + "%2.2f sec" % period)) + sys.stdout.flush() + + train_loss = np.array(train_info[0]).mean() + train_acc1 = np.array(train_info[1]).mean() + train_acc5 = np.array(train_info[2]).mean() + for data in test_reader(): + t1 = time.time() + loss, acc1, acc5 = exe.run( + inference_program, + feed=feeder.feed(data), + fetch_list=[avg_cost, acc_top1, acc_top5]) + t2 = time.time() + period = t2 - t1 + test_info[0].append(loss[0]) + test_info[1].append(acc1[0]) + test_info[2].append(acc5[0]) + if batch_id % 10 == 0: + print("Pass {0},testbatch {1},loss {2}, \ + acc1 {3},acc5 {4},time {5}" + .format(pass_id, \ + batch_id, loss[0], acc1[0], acc5[0], \ + "%2.2f sec" % period)) + sys.stdout.flush() + + test_loss = np.array(test_info[0]).mean() + test_acc1 = np.array(test_info[1]).mean() + test_acc5 = np.array(test_info[2]).mean() + + print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \ + test_loss {4}, test_acc1 {5}, test_acc5 {6}" + .format(pass_id, \ + train_loss, train_acc1, train_acc5, test_loss, test_acc1, \ + test_acc5)) + sys.stdout.flush() + + model_path = os.path.join(model_save_dir, str(pass_id)) + if not os.path.isdir(model_path): + os.makedirs(model_path) + fluid.io.save_persistables(exe, model_path) + + +def 
train_parallel_exe(args, + learning_rate, + batch_size, + num_passes, + init_model=None, + model_save_dir='model', + parallel=True, + use_nccl=True, + lr_strategy=None, + layers=50): + class_dim = 1000 + image_shape = [3, 224, 224] + + image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers) + cost = fluid.layers.cross_entropy(input=out, label=label) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + avg_cost = fluid.layers.mean(x=cost) + + test_program = fluid.default_main_program().clone(for_test=True) + + if lr_strategy is None: + optimizer = fluid.optimizer.Momentum( + learning_rate=learning_rate, + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + else: + bd = lr_strategy["bd"] + lr = lr_strategy["lr"] + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + + opts = optimizer.minimize(avg_cost) + + if args.with_mem_opt: + fluid.memory_optimize(fluid.default_main_program()) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if init_model is not None: + fluid.io.load_persistables(exe, init_model) + + train_reader = paddle.batch(reader.train(), batch_size=batch_size) + test_reader = paddle.batch(reader.test(), batch_size=batch_size) + feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) + + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) + test_exe = fluid.ParallelExecutor( + use_cuda=True, main_program=test_program, share_vars_from=train_exe) + + fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name] + + for pass_id in range(num_passes): + train_info = [[], [], []] + test_info = [[], [], []] + for batch_id, data in enumerate(train_reader()): + t1 = time.time() + loss, acc1, acc5 = train_exe.run(fetch_list, + feed_dict=feeder.feed(data)) + t2 = time.time() + period = t2 - t1 + loss = np.mean(np.array(loss)) + acc1 = np.mean(np.array(acc1)) + acc5 = np.mean(np.array(acc5)) + train_info[0].append(loss) + train_info[1].append(acc1) + train_info[2].append(acc5) + if batch_id % 10 == 0: + print("Pass {0}, trainbatch {1}, loss {2}, \ + acc1 {3}, acc5 {4} time {5}" + .format(pass_id, \ + batch_id, loss, acc1, acc5, \ + "%2.2f sec" % period)) + sys.stdout.flush() + + train_loss = np.array(train_info[0]).mean() + train_acc1 = np.array(train_info[1]).mean() + train_acc5 = np.array(train_info[2]).mean() + for data in test_reader(): + t1 = time.time() + loss, acc1, acc5 = test_exe.run(fetch_list, + feed_dict=feeder.feed(data)) + t2 = time.time() + period = t2 - t1 + loss = np.mean(np.array(loss)) + acc1 = np.mean(np.array(acc1)) + acc5 = np.mean(np.array(acc5)) + test_info[0].append(loss) + test_info[1].append(acc1) + test_info[2].append(acc5) + if batch_id % 10 == 0: + print("Pass {0},testbatch {1},loss {2}, \ + acc1 {3},acc5 {4},time {5}" + .format(pass_id, \ + batch_id, loss, acc1, acc5, \ + "%2.2f sec" % period)) + sys.stdout.flush() + + test_loss = np.array(test_info[0]).mean() + test_acc1 = np.array(test_info[1]).mean() + test_acc5 = np.array(test_info[2]).mean() + + print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \ + test_loss {4}, test_acc1 {5}, test_acc5 {6}" + .format(pass_id, \ + train_loss, 
train_acc1, train_acc5, test_loss, test_acc1, \
+              test_acc5))
+        sys.stdout.flush()
+
+        model_path = os.path.join(model_save_dir, str(pass_id))
+        if not os.path.isdir(model_path):
+            os.makedirs(model_path)
+        fluid.io.save_persistables(exe, model_path)
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    print_arguments(args)
+
+    epoch_points = [30, 60, 90]
+    total_images = 1281167
+    batch_size = args.batch_size
+    step = int(total_images / batch_size + 1)
+    bd = [e * step for e in epoch_points]
+    lr = [0.1, 0.01, 0.001, 0.0001]
+
+    lr_strategy = {"bd": bd, "lr": lr}
+
+    use_nccl = True
+    # layers: 50, 152
+    layers = args.num_layers
+    method = train_parallel_exe if args.parallel_exe else train_parallel_do
+    method(
+        args,
+        learning_rate=0.1,
+        batch_size=batch_size,
+        num_passes=120,
+        init_model=None,
+        parallel=True,
+        use_nccl=True,
+        lr_strategy=lr_strategy,
+        layers=layers)
diff --git a/fluid/image_classification/utility.py b/fluid/image_classification/utility.py
new file mode 100644
index 0000000000000000000000000000000000000000..506e6007ceb9059caf1163befb6ff594d67b547a
--- /dev/null
+++ b/fluid/image_classification/utility.py
@@ -0,0 +1,62 @@
+"""Contains common utility functions."""
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import distutils.util
+import numpy as np
+from paddle.fluid import core
+
+
+def print_arguments(args):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_argument("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
diff --git a/fluid/language_model/README.md b/fluid/language_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..91ce2d7f58085b56da2ac2dec03af2a05985ab8f
--- /dev/null
+++ b/fluid/language_model/README.md
@@ -0,0 +1,148 @@
+# Language Model
+
+The following is a brief overview of this example's directory structure and contents:
+
+```text
+.
+├── README.md          # this document
+├── train.py           # training script
+├── infer.py           # inference script
+└── utils.py           # common utility functions
+```
+
+
+## Introduction
+
+For an introduction to recurrent neural network language models, see the paper [Recurrent Neural Network Regularization](https://arxiv.org/abs/1409.2329). In this example we implement a GRU-RNN language model.
+
+## Training
+
+Run the command `python train.py` to start training the model.
+```text
+python train.py
+```
+
+The currently supported parameters can be found in the `train_net` function of [train.py](./train.py):
+```python
+vocab, train_reader, test_reader = utils.prepare_data(
+    batch_size=20, # batch size
+    buffer_size=1000, # buffer size, default value is OK
+    word_freq_threshold=0) # vocabulary related parameter, and words with frequency below this value will be filtered
+
+train(train_reader=train_reader,
+      vocab=vocab,
+      network=network,
+      hid_size=200, # embedding and hidden size
+      base_lr=1.0, # base learning rate
+      batch_size=20, # batch size, the same as that in prepare_data
+      pass_num=12, # the number of passes for training
+      use_cuda=True, # whether to use GPU card
+      parallel=False, # whether to be parallel
+      model_dir="model", # directory to save model
+      init_low_bound=-0.1, # uniform parameter initialization lower bound
+      init_high_bound=0.1) # uniform parameter initialization upper bound
+```
+
+## Customizing the Network Structure
+
+The network structure can be adjusted in the `network` function of [train.py](./train.py); the current structure is as follows:
+```python
+emb = fluid.layers.embedding(input=src, size=[vocab_size, hid_size],
+    param_attr=fluid.ParamAttr(
+        initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
+        learning_rate=emb_lr_x),
+    is_sparse=True)
+
+fc0 = fluid.layers.fc(input=emb, size=hid_size * 3,
+    param_attr=fluid.ParamAttr(
+        initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
+        learning_rate=gru_lr_x))
+gru_h0 = fluid.layers.dynamic_gru(input=fc0, size=hid_size,
+    param_attr=fluid.ParamAttr(
+        initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
+        learning_rate=gru_lr_x))
+
+fc = fluid.layers.fc(input=gru_h0, size=vocab_size, act='softmax',
+    param_attr=fluid.ParamAttr(
+        initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
+        learning_rate=fc_lr_x))
+
+cost = fluid.layers.cross_entropy(input=fc, label=dst)
+```
+
+## Sample Training Results
+
+The training log on a single Tesla K40m GPU card is shown below:
+```text
+epoch_1 start
+step:100 ppl:771.053
+step:200 ppl:449.597
+step:300 ppl:642.654
+step:400 ppl:458.128
+step:500 ppl:510.912
+step:600 ppl:451.545
+step:700 ppl:364.404
+step:800 ppl:324.272
+step:900 ppl:360.797
+step:1000 ppl:275.761
+step:1100 ppl:294.599
+step:1200 ppl:335.877
+step:1300 ppl:185.262
+step:1400 ppl:241.744
+step:1500 ppl:211.507
+step:1600 ppl:233.431
+step:1700 ppl:298.767
+step:1800 ppl:203.403
+step:1900 ppl:158.828
+step:2000 ppl:171.148
+step:2100 ppl:280.884
+epoch:1 num_steps:2104 time_cost(s):47.478780
+model saved in model/epoch_1
+epoch_2 start
+step:100 ppl:238.099
+step:200 ppl:136.527
+step:300 ppl:204.184
+step:400 ppl:252.886
+step:500 ppl:177.377
+step:600 ppl:197.688
+step:700 ppl:131.650
+step:800 ppl:223.906
+step:900 ppl:144.785
+step:1000 ppl:176.286
+step:1100 ppl:148.158
+step:1200 ppl:203.581
+step:1300 ppl:168.208
+step:1400 ppl:159.412
+step:1500 ppl:114.032
+step:1600 ppl:157.985
+step:1700 ppl:147.743
+step:1800 ppl:88.676
+step:1900 ppl:141.962
+step:2000 ppl:106.087
+step:2100 ppl:122.709
+epoch:2 num_steps:2104 time_cost(s):47.583789
+model saved in model/epoch_2
+...
+```
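+
+The `ppl` column above is perplexity, which [train.py](./train.py) derives
+from the average per-word cross entropy via `math.exp(...)`. A minimal
+standalone sketch of that conversion (the loss value below is illustrative,
+not taken from a real run):
+
+```python
+import math
+
+
+def to_ppl(avg_cost):
+    """convert an average per-word cross entropy into perplexity"""
+    return math.exp(avg_cost)
+
+
+print("%.3f" % to_ppl(5.5))  # -> 244.692
+```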
+
+## Inference
+
+Run the command `python infer.py model_dir start_epoch last_epoch(inclusive)` to start inference, where `start_epoch` specifies the first epoch to evaluate and `last_epoch` the last epoch (inclusive), e.g.
+```text
+python infer.py model 1 12 # prediction from epoch 1 to epoch 12
+```
+
+## Sample Inference Results
+```text
+model:model/epoch_1 ppl:254.540 time_cost(s):3.29
+model:model/epoch_2 ppl:177.671 time_cost(s):3.27
+model:model/epoch_3 ppl:156.251 time_cost(s):3.27
+model:model/epoch_4 ppl:139.036 time_cost(s):3.27
+model:model/epoch_5 ppl:132.661 time_cost(s):3.27
+model:model/epoch_6 ppl:130.092 time_cost(s):3.28
+model:model/epoch_7 ppl:128.751 time_cost(s):3.27
+model:model/epoch_8 ppl:125.411 time_cost(s):3.27
+model:model/epoch_9 ppl:124.604 time_cost(s):3.28
+model:model/epoch_10 ppl:124.754 time_cost(s):3.29
+model:model/epoch_11 ppl:125.421 time_cost(s):3.27
+model:model/epoch_12 ppl:125.676 time_cost(s):3.27
+```
diff --git a/fluid/language_model/infer.py b/fluid/language_model/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a183d54852dc9d56b76968a5f450479a43325304
--- /dev/null
+++ b/fluid/language_model/infer.py
@@ -0,0 +1,65 @@
+import sys
+import time
+import math
+import unittest
+import contextlib
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+
+import utils
+
+
+def infer(test_reader, use_cuda, model_path):
+    """ inference function """
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    with fluid.scope_guard(fluid.core.Scope()):
+        infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(
+            model_path, exe)
+
+        accum_cost = 0.0
+        accum_words = 0
+        t0 = time.time()
+        for data in test_reader():
+            src_wordseq = utils.to_lodtensor(map(lambda x: x[0], data), place)
+            dst_wordseq = utils.to_lodtensor(map(lambda x: x[1], data), place)
+            avg_cost = exe.run(
+                infer_program,
+                feed={"src_wordseq": src_wordseq,
+                      "dst_wordseq": dst_wordseq},
+                fetch_list=fetch_vars)
+
+            nwords = src_wordseq.lod()[0][-1]
+
+            cost = np.array(avg_cost) * nwords
+            accum_cost += cost
+            accum_words += nwords
+
+        ppl = math.exp(accum_cost / accum_words)
+        t1 = time.time()
+        print("model:%s ppl:%.3f time_cost(s):%.2f" %
+              (model_path, ppl, t1 - t0))
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        print("Usage: %s model_dir start_epoch last_epoch(inclusive)" %
+              sys.argv[0])
+        exit(0)
+
+    model_dir = sys.argv[1]
+    try:
+        start_index = int(sys.argv[2])
+        last_index = int(sys.argv[3])
+    except:
+        print("Usage: %s model_dir start_epoch last_epoch(inclusive)" %
+              sys.argv[0])
+        exit(-1)
+
+    vocab, train_reader, test_reader = utils.prepare_data(
+        batch_size=20, buffer_size=1000, word_freq_threshold=0)
+
+    for epoch in xrange(start_index, last_index + 1):
+        epoch_path = model_dir + "/epoch_" + str(epoch)
+        infer(test_reader=test_reader, use_cuda=True, model_path=epoch_path)
diff --git a/fluid/language_model/train.py b/fluid/language_model/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..59fc3a987746af7aec9b61b5c817400b6b6546d0
--- /dev/null
+++ b/fluid/language_model/train.py
@@ -0,0 +1,162 @@
+import sys
+import time
+
+import numpy as np
+import math
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+
+import utils
+
+
+def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
+    """ network definition """
+    emb_lr_x = 10.0
+    gru_lr_x = 1.0
+    fc_lr_x = 1.0
+    emb = fluid.layers.embedding(
+        input=src,
+        size=[vocab_size, hid_size],
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Uniform(
+                low=init_low_bound,
high=init_high_bound), + learning_rate=emb_lr_x), + is_sparse=True) + + fc0 = fluid.layers.fc(input=emb, + size=hid_size * 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=gru_lr_x)) + gru_h0 = fluid.layers.dynamic_gru( + input=fc0, + size=hid_size, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=gru_lr_x)) + + fc = fluid.layers.fc(input=gru_h0, + size=vocab_size, + act='softmax', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=fc_lr_x)) + + cost = fluid.layers.cross_entropy(input=fc, label=dst) + return cost + + +def train(train_reader, + vocab, + network, + hid_size, + base_lr, + batch_size, + pass_num, + use_cuda, + parallel, + model_dir, + init_low_bound=-0.04, + init_high_bound=0.04): + """ train network """ + vocab_size = len(vocab) + + src_wordseq = fluid.layers.data( + name="src_wordseq", shape=[1], dtype="int64", lod_level=1) + dst_wordseq = fluid.layers.data( + name="dst_wordseq", shape=[1], dtype="int64", lod_level=1) + + avg_cost = None + if not parallel: + cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size, + init_low_bound, init_high_bound) + avg_cost = fluid.layers.mean(x=cost) + else: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + with pd.do(): + cost = network( + pd.read_input(src_wordseq), + pd.read_input(dst_wordseq), vocab_size, hid_size, + init_low_bound, init_high_bound) + pd.write_output(cost) + + cost = pd() + avg_cost = fluid.layers.mean(x=cost) + + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=base_lr, + decay_steps=2100 * 4, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + total_time = 0.0 + for pass_idx in xrange(pass_num): + epoch_idx = pass_idx + 1 + print "epoch_%d start" % epoch_idx + + t0 = time.time() + i = 0 + for data in train_reader(): + i += 1 + lod_src_wordseq = utils.to_lodtensor( + map(lambda x: x[0], data), place) + lod_dst_wordseq = utils.to_lodtensor( + map(lambda x: x[1], data), place) + ret_avg_cost = exe.run(fluid.default_main_program(), + feed={ + "src_wordseq": lod_src_wordseq, + "dst_wordseq": lod_dst_wordseq + }, + fetch_list=[avg_cost], + use_program_cache=True) + avg_ppl = math.exp(ret_avg_cost[0]) + if i % 100 == 0: + print "step:%d ppl:%.3f" % (i, avg_ppl) + + t1 = time.time() + total_time += t1 - t0 + print "epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i, + total_time / epoch_idx) + + save_dir = "%s/epoch_%d" % (model_dir, epoch_idx) + feed_var_names = ["src_wordseq", "dst_wordseq"] + fetch_vars = [avg_cost] + fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe) + print("model saved in %s" % save_dir) + + print("finish training") + + +def train_net(): + """ do training """ + batch_size = 20 + vocab, train_reader, test_reader = utils.prepare_data( + batch_size=batch_size, buffer_size=1000, word_freq_threshold=0) + train( + train_reader=train_reader, + vocab=vocab, + network=network, + hid_size=200, + base_lr=1.0, + batch_size=batch_size, + pass_num=12, + use_cuda=True, + parallel=False, + model_dir="model", + init_low_bound=-0.1, + init_high_bound=0.1) + + +if __name__ == "__main__": + train_net() diff 
--git a/fluid/language_model/utils.py b/fluid/language_model/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c5909046176586556a2aedba5dd5d12810b3ea8d --- /dev/null +++ b/fluid/language_model/utils.py @@ -0,0 +1,40 @@ +import sys +import time +import numpy as np + +import paddle.fluid as fluid +import paddle.v2 as paddle + + +def to_lodtensor(data, place): + """ convert to LoDTensor """ + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0): + """ prepare the English Penn Treebank (PTB) data """ + vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold) + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imikolov.train( + vocab, + buffer_size, + data_type=paddle.dataset.imikolov.DataType.SEQ), + buf_size=buffer_size), + batch_size) + test_reader = paddle.batch( + paddle.dataset.imikolov.test( + vocab, buffer_size, data_type=paddle.dataset.imikolov.DataType.SEQ), + batch_size) + return vocab, train_reader, test_reader diff --git a/fluid/neural_machine_translation/transformer/config.py b/fluid/neural_machine_translation/transformer/config.py index 71e4314953383b8f89b40fdfd8cc4274f954fed1..c83b655f93ed80b8a599cb7fc789cf309d7d608b 100644 --- a/fluid/neural_machine_translation/transformer/config.py +++ b/fluid/neural_machine_translation/transformer/config.py @@ -15,6 +15,9 @@ class TrainTaskConfig(object): # the parameters for learning rate scheduling. warmup_steps = 4000 + # the flag indicating whether to use average loss or sum loss when training. + use_avg_cost = False + # the directory for saving trained models. model_dir = "trained_models" @@ -22,8 +25,7 @@ class InferTaskConfig(object): use_gpu = False # the number of examples in one run for sequence generation. - # currently the batch size can only be set to 1. - batch_size = 1 + batch_size = 10 # the parameters for beam search. beam_size = 5 @@ -31,37 +33,38 @@ # the number of decoded sentences to output. n_best = 1 + # the flags indicating whether to output the special tokens. + output_bos = False + output_eos = False + output_unk = False + # the directory for loading the trained model. model_path = "trained_models/pass_1.infer.model" class ModelHyperParams(object): - # Dictionary size for source and target language. This model directly uses - # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has - # alreay been added, but the <pad> token is not added. Transformer requires - # sequences in a mini-batch are padded to have the same length. A <pad> token is - # added into the original dictionary in paddle.dateset.wmt16. + # This model directly uses paddle.dataset.wmt16 in which the <bos>, <eos> and + # <unk> tokens have already been added. As for the <pad> token, any token + # included in dict can be used to pad, since the paddings' loss will be + # masked out and have no effect on parameter gradients. # size of source word dictionary. src_vocab_size = 10000 - # index for <pad> token in source language. - src_pad_idx = src_vocab_size # size of target word dictionary trg_vocab_size = 10000 - # index for <pad> token in target language.
- trg_pad_idx = trg_vocab_size # index for <bos> token bos_idx = 0 # index for <eos> token eos_idx = 1 + # index for <unk> token + unk_idx = 2 - # position value corresponding to the <pad> token. - pos_pad_idx = 0 - - # max length of sequences. It should plus 1 to include position - # padding token for position encoding. + # max length of sequences. + # The size of the position encoding table should be at least max_length plus 1, + # since the sinusoid position encoding starts from 1 and 0 can be used as the + # padding token for position encoding. max_length = 50 # the dimension for word embeddings, which is also the last dimension of @@ -92,7 +95,10 @@ pos_enc_param_names = ( encoder_input_data_names = ( "src_word", "src_pos", - "src_slf_attn_bias", ) + "src_slf_attn_bias", + "src_data_shape", + "src_slf_attn_pre_softmax_shape", + "src_slf_attn_post_softmax_shape", ) # Names of all data layers in decoder listed in order. decoder_input_data_names = ( @@ -100,6 +106,11 @@ decoder_input_data_names = ( "trg_pos", "trg_slf_attn_bias", "trg_src_attn_bias", + "trg_data_shape", + "trg_slf_attn_pre_softmax_shape", + "trg_slf_attn_post_softmax_shape", + "trg_src_attn_pre_softmax_shape", + "trg_src_attn_post_softmax_shape", "enc_output", ) # Names of label related data layers listed in order. diff --git a/fluid/neural_machine_translation/transformer/infer.py b/fluid/neural_machine_translation/transformer/infer.py index e4dee220cedf856633ee626b762804e49a10cfe8..ad7fc2fa39db15698842aae26c80d86f7592775b 100644 --- a/fluid/neural_machine_translation/transformer/infer.py +++ b/fluid/neural_machine_translation/transformer/infer.py @@ -1,6 +1,6 @@ import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import model @@ -11,10 +11,26 @@ from config import InferTaskConfig, ModelHyperParams, \ from train import pad_batch_data -def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, - decoder, dec_in_names, dec_out_names, beam_size, max_length, - n_best, batch_size, n_head, src_pad_idx, trg_pad_idx, - bos_idx, eos_idx): +def translate_batch(exe, + src_words, + encoder, + enc_in_names, + enc_out_names, + decoder, + dec_in_names, + dec_out_names, + beam_size, + max_length, + n_best, + batch_size, + n_head, + d_model, + src_pad_idx, + trg_pad_idx, + bos_idx, + eos_idx, + unk_idx, + output_unk=True): """ Run the encoder program once and run the decoder program multiple times to implement beam search externally. @@ -25,9 +41,21 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, src_pad_idx, n_head, is_target=False, - return_pos=True, + is_label=False, return_attn_bias=True, - return_max_len=True) + return_max_len=False) + # Append the data shape input to reshape the output of embedding layer. + enc_in_data = enc_in_data + [ + np.array( + [-1, enc_in_data[2].shape[-1], d_model], dtype="int32") + ] + # Append the shape inputs to reshape before and after softmax in encoder + # self attention. + enc_in_data = enc_in_data + [ + np.array( + [-1, enc_in_data[2].shape[-1]], dtype="int32"), np.array( + enc_in_data[2].shape, dtype="int32") + ] enc_output = exe.run(encoder, feed=dict(zip(enc_in_names, enc_in_data)), fetch_list=enc_out_names)[0] @@ -35,13 +63,18 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, # Beam Search. # To store the beam info.
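+ # Note (added for clarity, not in the original patch): scores[i, j] holds the accumulated log-probability of beam j for instance i; + # prev_branchs[i] and next_ids[i] record, per decoding step, the parent branch and the chosen token ids, so beam_backtrace below can rebuild a full hypothesis by walking backwards.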
scores = np.zeros((batch_size, beam_size), dtype="float32") - prev_branchs = [[]] * batch_size - next_ids = [[]] * batch_size - # Use beam_map to map the instance idx in batch to beam idx, since the + prev_branchs = [[] for i in range(batch_size)] + next_ids = [[] for i in range(batch_size)] + # Use beam_inst_map to map beam idx to the instance idx in batch, since the # size of the fed batch is changing. - beam_map = range(batch_size) + beam_inst_map = { + beam_idx: inst_idx + for inst_idx, beam_idx in enumerate(range(batch_size)) + } + # Use active_beams to record the alive beams. + active_beams = range(batch_size) - def beam_backtrace(prev_branchs, next_ids, n_best=beam_size, add_bos=True): + def beam_backtrace(prev_branchs, next_ids, n_best=beam_size): """ Decode and select n_best sequences for one instance by backtrace. """ @@ -53,7 +86,8 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, seq.append(next_ids[j][k]) k = prev_branchs[j][k] seq = seq[::-1] - seq = [bos_idx] + seq if add_bos else seq + # Add the <bos>, since next_ids don't include the <bos>. + seq = [bos_idx] + seq seqs.append(seq) return seqs @@ -64,8 +98,8 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, trg_words = np.array( [[bos_idx]] * batch_size * beam_size, dtype="int64") trg_pos = np.array([[1]] * batch_size * beam_size, dtype="int64") - src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[ - -1], enc_in_data[-2], 1 + src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[2].shape[ + -1], enc_in_data[2], 1 # This is used to remove attention on subsequent words. trg_slf_attn_bias = np.ones((batch_size * beam_size, trg_max_len, trg_max_len)) @@ -75,22 +109,47 @@ [-1e9]).astype("float32") # This is used to remove attention on the paddings of source sequences. trg_src_attn_bias = np.tile( - src_slf_attn_bias[:, :, ::src_max_length, :], - [beam_size, 1, trg_max_len, 1]) - enc_output = np.tile(enc_output, [beam_size, 1, 1]) - return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output + src_slf_attn_bias[:, :, ::src_max_length, :][:, np.newaxis], + [1, beam_size, 1, trg_max_len, 1]).reshape([ + -1, src_slf_attn_bias.shape[1], trg_max_len, + src_slf_attn_bias.shape[-1] + ]) + # Append the shape input to reshape the output of embedding layer. + trg_data_shape = np.array( + [batch_size * beam_size, trg_max_len, d_model], dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # decoder self attention. + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # encoder-decoder attention.
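+ # Note (added for clarity): these shape tensors are fed as run-time data because the compiled + # program only knows the max_length placeholder; e.g. with beam_size 5 and a source length of 8, + # the pre-softmax shape flattens the attention logits to [-1, 8] while the post-softmax shape + # restores the 4-D bias-shaped layout (illustrative numbers only).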
+ trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") + enc_output = np.tile( + enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape( + [-1, enc_output.shape[-2], enc_output.shape[-1]]) + return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output - def update_dec_in_data(dec_in_data, next_ids, active_beams): + def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map): """ Update the input data of decoder mainly by slicing from the previous input data and dropping the finished instance beams. """ - trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = dec_in_data - trg_cur_len = len(next_ids[0]) + 1 # include the + trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output = dec_in_data + trg_cur_len = trg_slf_attn_bias.shape[-1] + 1 trg_words = np.array( [ - beam_backtrace( - prev_branchs[beam_idx], next_ids[beam_idx], add_bos=True) + beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx]) for beam_idx in active_beams ], dtype="int64") @@ -98,6 +157,7 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, trg_pos = np.array( [range(1, trg_cur_len + 1)] * len(active_beams) * beam_size, dtype="int64").reshape([-1, 1]) + active_beams = [beam_inst_map[beam_idx] for beam_idx in active_beams] active_beams_indice = ( (np.array(active_beams) * beam_size)[:, np.newaxis] + np.array(range(beam_size))[np.newaxis, :]).flatten() @@ -112,8 +172,27 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, trg_src_attn_bias = np.tile(trg_src_attn_bias[ active_beams_indice, :, ::trg_src_attn_bias.shape[2], :], [1, 1, trg_cur_len, 1]) + # Append the shape input to reshape the output of embedding layer. + trg_data_shape = np.array( + [len(active_beams) * beam_size, trg_cur_len, d_model], + dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # decoder self attention. + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # encoder-decoder attention. 
+ trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") enc_output = enc_output[active_beams_indice, :, :] - return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output + return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data, enc_output) @@ -122,13 +201,18 @@ feed=dict(zip(dec_in_names, dec_in_data)), fetch_list=dec_out_names)[0] predict_all = np.log( - predict_all.reshape([len(beam_map) * beam_size, i + 1, -1])[:, - -1, :]) - predict_all = (predict_all + scores[beam_map].reshape( - [len(beam_map) * beam_size, -1])).reshape( - [len(beam_map), beam_size, -1]) + predict_all.reshape([len(beam_inst_map) * beam_size, i + 1, -1]) + [:, -1, :]) + predict_all = (predict_all + scores[active_beams].reshape( + [len(beam_inst_map) * beam_size, -1])).reshape( + [len(beam_inst_map), beam_size, -1]) + if not output_unk: # To exclude the <unk> token. + predict_all[:, :, unk_idx] = -1e9 active_beams = [] - for inst_idx, beam_idx in enumerate(beam_map): + for beam_idx in range(batch_size): + if not beam_inst_map.has_key(beam_idx): + continue + inst_idx = beam_inst_map[beam_idx] predict = (predict_all[inst_idx, :, :] if i != 0 else predict_all[inst_idx, 0, :]).flatten() top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:] @@ -141,13 +225,20 @@ next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1]) if next_ids[beam_idx][-1][0] != eos_idx: active_beams.append(beam_idx) - beam_map = active_beams - if len(beam_map) == 0: + if len(active_beams) == 0: break - dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams) + dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams, + beam_inst_map) + beam_inst_map = { + beam_idx: inst_idx + for inst_idx, beam_idx in enumerate(active_beams) + } # Decode beams and select n_best sequences for each instance by backtrace. - seqs = [beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best)] + seqs = [ + beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best) + for beam_idx in range(batch_size) + ] return seqs, scores[:, :n_best].tolist() @@ -155,29 +246,24 @@ def main(): place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) - # The current program desc is coupled with batch_size and the only - # supported batch size is 1 currently.
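+ # Note (added for clarity): with the batch-size coupling removed, inference builds two separate + # programs; the encoder program runs once per batch and the decoder program is re-run step by + # step inside translate_batch to implement beam search externally. For example, with batch_size 3 + # and instance 1 finished, active_beams == [0, 2] and beam_inst_map == {0: 0, 2: 1}.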
+ encoder_program = fluid.Program() - model.batch_size = InferTaskConfig.batch_size with fluid.program_guard(main_program=encoder_program): enc_output = encoder( - ModelHyperParams.src_vocab_size + 1, - ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, - ModelHyperParams.n_head, ModelHyperParams.d_key, - ModelHyperParams.d_value, ModelHyperParams.d_model, - ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, - ModelHyperParams.src_pad_idx, ModelHyperParams.pos_pad_idx) - - model.batch_size = InferTaskConfig.batch_size * InferTaskConfig.beam_size + ModelHyperParams.src_vocab_size, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout) + decoder_program = fluid.Program() with fluid.program_guard(main_program=decoder_program): predict = decoder( - ModelHyperParams.trg_vocab_size + 1, - ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, - ModelHyperParams.n_head, ModelHyperParams.d_key, - ModelHyperParams.d_value, ModelHyperParams.d_model, - ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, - ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) + ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout) # Load model parameters of encoder and decoder separately from the saved # transformer model. @@ -214,17 +300,51 @@ trg_idx2word = paddle.dataset.wmt16.get_dict( "de", dict_size=ModelHyperParams.trg_vocab_size, reverse=True) + def post_process_seq(seq, + bos_idx=ModelHyperParams.bos_idx, + eos_idx=ModelHyperParams.eos_idx, + output_bos=InferTaskConfig.output_bos, + output_eos=InferTaskConfig.output_eos): + """ + Post-process the beam-search decoded sequence. Truncate from the first + <eos> and remove the <bos> and <eos> tokens currently. + """ + eos_pos = len(seq) - 1 + for i, idx in enumerate(seq): + if idx == eos_idx: + eos_pos = i + break + seq = seq[:eos_pos + 1] + return filter( + lambda idx: (output_bos or idx != bos_idx) and \ + (output_eos or idx != eos_idx), + seq) + for batch_id, data in enumerate(test_data()): batch_seqs, batch_scores = translate_batch( - exe, [item[0] for item in data], encoder_program, - encoder_input_data_names, [enc_output.name], decoder_program, - decoder_input_data_names, [predict.name], InferTaskConfig.beam_size, - InferTaskConfig.max_length, InferTaskConfig.n_best, - len(data), ModelHyperParams.n_head, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.bos_idx, - ModelHyperParams.eos_idx) + exe, + [item[0] for item in data], + encoder_program, + encoder_input_data_names, + [enc_output.name], + decoder_program, + decoder_input_data_names, + [predict.name], + InferTaskConfig.beam_size, + InferTaskConfig.max_length, + InferTaskConfig.n_best, + len(data), + ModelHyperParams.n_head, + ModelHyperParams.d_model, + ModelHyperParams.eos_idx, # Use eos_idx to pad. + ModelHyperParams.eos_idx, # Use eos_idx to pad. + ModelHyperParams.bos_idx, + ModelHyperParams.eos_idx, + ModelHyperParams.unk_idx, + output_unk=InferTaskConfig.output_unk) for i in range(len(batch_seqs)): - seqs = batch_seqs[i] + # Post-process the beam-search decoded sequences.
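+ # e.g. with output_bos=False and output_eos=False, a raw beam result such as + # [0, 5, 9, 1, 7] (0 = <bos>, 1 = <eos>) is truncated at the first <eos> and + # filtered to [5, 9] (hypothetical token ids, added for illustration).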
+ seqs = map(post_process_seq, batch_seqs[i]) scores = batch_scores[i] for seq in seqs: print(" ".join([trg_idx2word[idx] for idx in seq])) diff --git a/fluid/neural_machine_translation/transformer/model.py b/fluid/neural_machine_translation/transformer/model.py index ba5ba4470759da5fd2c6dd3b3d61b88c3468bd27..2b5519674282edb4d927f48b0a32eb82b459514d 100644 --- a/fluid/neural_machine_translation/transformer/model.py +++ b/fluid/neural_machine_translation/transformer/model.py @@ -7,9 +7,6 @@ import paddle.fluid.layers as layers from config import TrainTaskConfig, pos_enc_param_names, \ encoder_input_data_names, decoder_input_data_names, label_data_names -# FIXME(guosheng): Remove out the batch_size from the model. -batch_size = TrainTaskConfig.batch_size def position_encoding_init(n_position, d_pos_vec): """ @@ -32,7 +29,9 @@ def multi_head_attention(queries, d_value, d_model, n_head=1, - dropout_rate=0.): + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): """ Multi-Head Attention. Note that attn_bias is added to the logit before computing softmax activation to mask certain selected positions so that @@ -83,9 +82,10 @@ def multi_head_attention(queries, return x hidden_size = x.shape[-1] - # FIXME(guosheng): Decouple the program desc with batch_size. + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. reshaped = layers.reshape( - x=x, shape=[batch_size, -1, n_head, hidden_size // n_head]) + x=x, shape=[0, -1, n_head, hidden_size // n_head]) # permute the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] @@ -101,36 +101,26 @@ def multi_head_attention(queries, raise ValueError("Input(x) should be a 4-D Tensor.") trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # FIXME(guosheng): Decouple the program desc with batch_size. + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. return layers.reshape( x=trans_x, - shape=map(int, - [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]])) + shape=map(int, [0, -1, trans_x.shape[2] * trans_x.shape[3]])) def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): """ Scaled Dot-Product Attention """ - - # FIXME(guosheng): Optimize the shape in reshape_op or softmax_op. - - # The current implementation of softmax_op only supports 2D tensor, - # consequently it cannot be directly used here. - # If to use the reshape_op, Besides, the shape of product inferred in - # compile-time is not the actual shape in run-time. It cann't be used - # to set the attribute of reshape_op. - # So, here define the softmax for temporary solution.
- - def __softmax(x, eps=1e-9): - exp_out = layers.exp(x=x) - sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False) - return layers.elementwise_div(x=exp_out, y=sum_out, axis=0) - scaled_q = layers.scale(x=q, scale=d_model**-0.5) product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - weights = __softmax( - layers.elementwise_add( - x=product, y=attn_bias) if attn_bias else product) + weights = layers.reshape( + x=layers.elementwise_add( + x=product, y=attn_bias) if attn_bias else product, + shape=[-1, product.shape[-1]], + actual_shape=pre_softmax_shape, + act="softmax") + weights = layers.reshape( + x=weights, shape=product.shape, actual_shape=post_softmax_shape) if dropout_rate: weights = layers.dropout( weights, dropout_prob=dropout_rate, is_test=False) @@ -177,7 +167,7 @@ def positionwise_feed_forward(x, d_inner_hid, d_hid): return out -def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.): +def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.): """ Add residual connection, layer normalization and dropout to the out tensor optionally according to the value of process_cmd. @@ -195,8 +185,9 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.): param_attr=fluid.initializer.Constant(1.), bias_attr=fluid.initializer.Constant(0.)) elif cmd == "d": # add dropout - if dropout: - out = layers.dropout(out, dropout_prob=dropout, is_test=False) + if dropout_rate: + out = layers.dropout( + out, dropout_prob=dropout_rate, is_test=False) return out @@ -208,10 +199,9 @@ def prepare_encoder(src_word, src_pos, src_vocab_size, src_emb_dim, - src_pad_idx, src_max_len, - dropout=0., - pos_pad_idx=0, + dropout_rate=0., + src_data_shape=None, pos_enc_param_name=None): """Add word embeddings and position encodings. The output tensor has a shape of: @@ -222,21 +212,20 @@ def prepare_encoder(src_word, src_word_emb = layers.embedding( src_word, size=[src_vocab_size, src_emb_dim], - padding_idx=src_pad_idx, param_attr=fluid.initializer.Normal(0., 1.)) src_pos_enc = layers.embedding( src_pos, size=[src_max_len, src_emb_dim], - padding_idx=pos_pad_idx, param_attr=fluid.ParamAttr( name=pos_enc_param_name, trainable=False)) enc_input = src_word_emb + src_pos_enc - - # FIXME(guosheng): Decouple the program desc with batch_size. - enc_input = layers.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim]) + enc_input = layers.reshape( + x=enc_input, + shape=[-1, src_max_len, src_emb_dim], + actual_shape=src_data_shape) return layers.dropout( - enc_input, dropout_prob=dropout, - is_test=False) if dropout else enc_input + enc_input, dropout_prob=dropout_rate, + is_test=False) if dropout_rate else enc_input prepare_encoder = partial( @@ -252,7 +241,9 @@ def encoder_layer(enc_input, d_value, d_model, d_inner_hid, - dropout_rate=0.): + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): """The encoder layers that can be stacked to form a deep encoder. This module consists of a multi-head (self) attention followed by @@ -260,9 +251,9 @@ with the post_process_layer to add residual connection, layer normalization and dropout.
""" - attn_output = multi_head_attention(enc_input, enc_input, enc_input, - attn_bias, d_key, d_value, d_model, - n_head, dropout_rate) + attn_output = multi_head_attention( + enc_input, enc_input, enc_input, attn_bias, d_key, d_value, d_model, + n_head, dropout_rate, pre_softmax_shape, post_softmax_shape) attn_output = post_process_layer(enc_input, attn_output, "dan", dropout_rate) ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model) @@ -277,7 +268,9 @@ def encoder(enc_input, d_value, d_model, d_inner_hid, - dropout_rate=0.): + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): """ The encoder is composed of a stack of identical layers returned by calling encoder_layer. @@ -291,7 +284,9 @@ def encoder(enc_input, d_value, d_model, d_inner_hid, - dropout_rate, ) + dropout_rate, + pre_softmax_shape, + post_softmax_shape, ) enc_input = enc_output return enc_output @@ -305,7 +300,11 @@ def decoder_layer(dec_input, d_value, d_model, d_inner_hid, - dropout_rate=0.): + dropout_rate=0., + slf_attn_pre_softmax_shape=None, + slf_attn_post_softmax_shape=None, + src_attn_pre_softmax_shape=None, + src_attn_post_softmax_shape=None): """ The layer to be stacked in decoder part. The structure of this module is similar to that in the encoder part except @@ -320,7 +319,9 @@ def decoder_layer(dec_input, d_value, d_model, n_head, - dropout_rate, ) + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, ) slf_attn_output = post_process_layer( dec_input, slf_attn_output, @@ -335,7 +336,9 @@ def decoder_layer(dec_input, d_value, d_model, n_head, - dropout_rate, ) + dropout_rate, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) enc_attn_output = post_process_layer( slf_attn_output, enc_attn_output, @@ -363,7 +366,11 @@ def decoder(dec_input, d_value, d_model, d_inner_hid, - dropout_rate=0.): + dropout_rate=0., + slf_attn_pre_softmax_shape=None, + slf_attn_post_softmax_shape=None, + src_attn_pre_softmax_shape=None, + src_attn_post_softmax_shape=None): """ The decoder is composed of a stack of identical decoder_layer layers. """ @@ -378,7 +385,11 @@ def decoder(dec_input, d_value, d_model, d_inner_hid, - dropout_rate, ) + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) dec_input = dec_output return dec_output @@ -386,18 +397,23 @@ def decoder(dec_input, def make_inputs(input_data_names, n_head, d_model, - batch_size, max_length, is_pos, slf_attn_bias_flag, src_attn_bias_flag, - enc_output_flag=False): + enc_output_flag=False, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=True): """ Define the input data layers for the transformer model. """ input_layers = [] - # The shapes here act as placeholder. - # The shapes set here is to pass the infer-shape in compile time. + batch_size = 1 # Only for the infer-shape in compile time. + # The shapes here act as placeholder and are set to pass the infer-shape in + # compile time. + # The actual data shape of word is: + # [batch_size * max_len_in_batch, 1] word = layers.data( name=input_data_names[len(input_layers)], shape=[batch_size * max_length, 1], @@ -405,6 +421,8 @@ def make_inputs(input_data_names, append_batch_size=False) input_layers += [word] # This is used for position data or label weight. 
+ # The actual data shape of pos is: + # [batch_size * max_len_in_batch, 1] pos = layers.data( name=input_data_names[len(input_layers)], shape=[batch_size * max_length, 1], @@ -415,6 +433,8 @@ # This input is used to remove attention weights on paddings for the # encoder and to remove attention weights on subsequent words for the # decoder. + # The actual data shape of slf_attn_bias is: + # [batch_size, n_head, max_len_in_batch, max_len_in_batch] slf_attn_bias = layers.data( name=input_data_names[len(input_layers)], shape=[batch_size, n_head, max_length, max_length], dtype="float32", append_batch_size=False) input_layers += [slf_attn_bias] @@ -422,20 +442,67 @@ if src_attn_bias_flag: - # This input is used to remove attention weights on paddings. + # This input is used to remove attention weights on paddings. It's used + # in encoder-decoder attention. + # The actual data shape of src_attn_bias is: + # [batch_size, n_head, trg_max_len_in_batch, src_max_len_in_batch] src_attn_bias = layers.data( name=input_data_names[len(input_layers)], shape=[batch_size, n_head, max_length, max_length], dtype="float32", append_batch_size=False) input_layers += [src_attn_bias] + if data_shape_flag: + # This input is used to reshape the output of embedding layer. + data_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[3], + dtype="int32", + append_batch_size=False) + input_layers += [data_shape] + if slf_attn_shape_flag: + # This shape input is used to reshape before softmax in self attention. + slf_attn_pre_softmax_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[2], + dtype="int32", + append_batch_size=False) + input_layers += [slf_attn_pre_softmax_shape] + # This shape input is used to reshape after softmax in self attention. + slf_attn_post_softmax_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[4], + dtype="int32", + append_batch_size=False) + input_layers += [slf_attn_post_softmax_shape] + if src_attn_shape_flag: + # This shape input is used to reshape before softmax in encoder-decoder + # attention. + src_attn_pre_softmax_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[2], + dtype="int32", + append_batch_size=False) + input_layers += [src_attn_pre_softmax_shape] + # This shape input is used to reshape after softmax in encoder-decoder + # attention. + src_attn_post_softmax_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[4], + dtype="int32", + append_batch_size=False) + input_layers += [src_attn_post_softmax_shape] if enc_output_flag: + # This input is used in independent decoder program for inference.
+ # The actual data shape of enc_output is: + # [batch_size, max_len_in_batch, d_model] enc_output = layers.data( name=input_data_names[len(input_layers)], shape=[batch_size, max_length, d_model], dtype="float32", append_batch_size=False) input_layers += [enc_output] + return input_layers @@ -449,12 +516,19 @@ def transformer( d_value, d_model, d_inner_hid, - dropout_rate, - src_pad_idx, - trg_pad_idx, - pos_pad_idx, ): - enc_input_layers = make_inputs(encoder_input_data_names, n_head, d_model, - batch_size, max_length, True, True, False) + dropout_rate, ): + enc_inputs = make_inputs( + encoder_input_data_names, + n_head, + d_model, + max_length, + is_pos=True, + slf_attn_bias_flag=True, + src_attn_bias_flag=False, + enc_output_flag=False, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=False) enc_output = wrap_encoder( src_vocab_size, @@ -466,12 +540,20 @@ d_model, d_inner_hid, dropout_rate, - src_pad_idx, - pos_pad_idx, - enc_input_layers, ) + enc_inputs, ) - dec_input_layers = make_inputs(decoder_input_data_names, n_head, d_model, - batch_size, max_length, True, True, True) + dec_inputs = make_inputs( + decoder_input_data_names, + n_head, + d_model, + max_length, + is_pos=True, + slf_attn_bias_flag=True, + src_attn_bias_flag=True, + enc_output_flag=False, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=True) predict = wrap_decoder( trg_vocab_size, @@ -483,18 +565,29 @@ d_model, d_inner_hid, dropout_rate, - trg_pad_idx, - pos_pad_idx, - dec_input_layers, + dec_inputs, enc_output, ) # Padding indices do not contribute to the total loss. The weights are used to # cancel the padding indices in calculating the loss. - gold, weights = make_inputs(label_data_names, n_head, d_model, batch_size, - max_length, False, False, False) - cost = layers.cross_entropy(input=predict, label=gold) + gold, weights = make_inputs( + label_data_names, + n_head, + d_model, + max_length, + is_pos=False, + slf_attn_bias_flag=False, + src_attn_bias_flag=False, + enc_output_flag=False, + data_shape_flag=False, + slf_attn_shape_flag=False, + src_attn_shape_flag=False) + cost = layers.softmax_with_cross_entropy(logits=predict, label=gold) weighted_cost = cost * weights - return layers.reduce_sum(weighted_cost), predict + sum_cost = layers.reduce_sum(weighted_cost) + token_num = layers.reduce_sum(weights) + avg_cost = sum_cost / token_num + return sum_cost, avg_cost, predict, token_num def wrap_encoder(src_vocab_size, @@ -506,27 +599,38 @@ d_model, d_inner_hid, dropout_rate, - src_pad_idx, - pos_pad_idx, - enc_input_layers=None): + enc_inputs=None): """ The wrapper assembles together all needed layers for the encoder. """ - if enc_input_layers is None: + if enc_inputs is None: # This is used to implement independent encoder program in inference.
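+ # Note (added for clarity): during training enc_inputs is passed in from transformer(); at + # inference time the data layers are re-declared here so the encoder can run as a standalone + # program whose output is then fed to the separate decoder program.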
- src_word, src_pos, src_slf_attn_bias = make_inputs( - encoder_input_data_names, n_head, d_model, batch_size, max_length, - True, True, False) + src_word, src_pos, src_slf_attn_bias, src_data_shape, \ + slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ + make_inputs( + encoder_input_data_names, + n_head, + d_model, + max_length, + is_pos=True, + slf_attn_bias_flag=True, + src_attn_bias_flag=False, + enc_output_flag=False, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=False) else: - src_word, src_pos, src_slf_attn_bias = enc_input_layers + src_word, src_pos, src_slf_attn_bias, src_data_shape, \ + slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ + enc_inputs enc_input = prepare_encoder( src_word, src_pos, src_vocab_size, d_model, - src_pad_idx, max_length, - dropout_rate, ) + dropout_rate, + src_data_shape, ) enc_output = encoder( enc_input, src_slf_attn_bias, @@ -536,7 +640,9 @@ def wrap_encoder(src_vocab_size, d_value, d_model, d_inner_hid, - dropout_rate, ) + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, ) return enc_output @@ -549,29 +655,42 @@ def wrap_decoder(trg_vocab_size, d_model, d_inner_hid, dropout_rate, - trg_pad_idx, - pos_pad_idx, - dec_input_layers=None, + dec_inputs=None, enc_output=None): """ The wrapper assembles together all needed layers for the decoder. """ - if dec_input_layers is None: + if dec_inputs is None: # This is used to implement independent decoder program in inference. - trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = make_inputs( - decoder_input_data_names, n_head, d_model, batch_size, max_length, - True, True, True, True) + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, slf_attn_pre_softmax_shape, \ + slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ + src_attn_post_softmax_shape, enc_output = make_inputs( + decoder_input_data_names, + n_head, + d_model, + max_length, + is_pos=True, + slf_attn_bias_flag=True, + src_attn_bias_flag=True, + enc_output_flag=True, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=True) else: - trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_input_layers + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, slf_attn_pre_softmax_shape, \ + slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ + src_attn_post_softmax_shape = dec_inputs dec_input = prepare_decoder( trg_word, trg_pos, trg_vocab_size, d_model, - trg_pad_idx, max_length, - dropout_rate, ) + dropout_rate, + trg_data_shape, ) dec_output = decoder( dec_input, enc_output, @@ -583,13 +702,17 @@ def wrap_decoder(trg_vocab_size, d_value, d_model, d_inner_hid, - dropout_rate, ) - + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) + # Return logits for training and probs for inference. 
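+ # Note (added for clarity): softmax_with_cross_entropy in the training graph consumes raw + # logits, so the softmax activation is applied here only when building the standalone decoder + # for inference (dec_inputs is None).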
predict = layers.reshape( x=layers.fc(input=dec_output, size=trg_vocab_size, bias_attr=False, num_flatten_dims=2), shape=[-1, trg_vocab_size], - act="softmax") + act="softmax" if dec_inputs is None else None) return predict diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py index 65de8ef7fa8421bd72175175f1cf421a4237ddd5..ffbc4bd57a3a06531e36e3eddc142febf2c57d02 100644 --- a/fluid/neural_machine_translation/transformer/train.py +++ b/fluid/neural_machine_translation/transformer/train.py @@ -1,7 +1,8 @@ import os +import time import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid from model import transformer, position_encoding_init @@ -14,7 +15,7 @@ def pad_batch_data(insts, pad_idx, n_head, is_target=False, - return_pos=True, + is_label=False, return_attn_bias=True, return_max_len=True): """ @@ -23,14 +24,20 @@ """ return_list = [] max_len = max(len(inst) for inst in insts) + # Any token included in dict can be used to pad, since the paddings' loss + # will be masked out by weights and have no effect on parameter gradients. inst_data = np.array( [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) return_list += [inst_data.astype("int64").reshape([-1, 1])] - if return_pos: - inst_pos = np.array([[ - pos_i + 1 if w_i != pad_idx else 0 for pos_i, w_i in enumerate(inst) - ] for inst in inst_data]) - + if is_label: # label weight + inst_weight = np.array( + [[1.] * len(inst) + [0.] * (max_len - len(inst)) for inst in insts]) + return_list += [inst_weight.astype("float32").reshape([-1, 1])] + else: # position data + inst_pos = np.array([ + range(1, len(inst) + 1) + [0] * (max_len - len(inst)) + for inst in insts + ]) return_list += [inst_pos.astype("int64").reshape([-1, 1])] if return_attn_bias: if is_target: @@ -56,7 +63,7 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, - max_length, n_head): + n_head, d_model): """ Put all padded data needed by training into a dict. """ @@ -66,13 +73,40 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], [1, 1, trg_max_len, 1]).astype("float32") - lbl_word = pad_batch_data([inst[2] for inst in insts], trg_pad_idx, n_head, - False, False, False, False) - lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) + + # These shape tensors are used in reshape_op.
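+ # e.g. for 4 sentences with src_max_len 30 and d_model 512, src_data_shape would be + # np.array([4, 30, 512], dtype="int32") and src_slf_attn_pre_softmax_shape would be + # np.array([-1, 30], dtype="int32") (hypothetical numbers, added for illustration).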
+ src_data_shape = np.array([len(insts), src_max_len, d_model], dtype="int32") + trg_data_shape = np.array([len(insts), trg_max_len, d_model], dtype="int32") + src_slf_attn_pre_softmax_shape = np.array( + [-1, src_slf_attn_bias.shape[-1]], dtype="int32") + src_slf_attn_post_softmax_shape = np.array( + src_slf_attn_bias.shape, dtype="int32") + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") + + lbl_word, lbl_weight = pad_batch_data( + [inst[2] for inst in insts], + trg_pad_idx, + n_head, + is_target=False, + is_label=True, + return_attn_bias=False, + return_max_len=False) + input_dict = dict( zip(input_data_names, [ - src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, - trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + src_word, src_pos, src_slf_attn_bias, src_data_shape, + src_slf_attn_pre_softmax_shape, src_slf_attn_post_softmax_shape, + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, + trg_data_shape, trg_slf_attn_pre_softmax_shape, + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, + trg_src_attn_post_softmax_shape, lbl_word, lbl_weight ])) return input_dict @@ -81,14 +115,12 @@ def main(): place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) - cost, predict = transformer( - ModelHyperParams.src_vocab_size + 1, - ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, - ModelHyperParams.n_layer, ModelHyperParams.n_head, - ModelHyperParams.d_key, ModelHyperParams.d_value, - ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, - ModelHyperParams.dropout, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) + sum_cost, avg_cost, predict, token_num = transformer( + ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, + ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, + ModelHyperParams.n_head, ModelHyperParams.d_key, + ModelHyperParams.d_value, ModelHyperParams.d_model, + ModelHyperParams.d_inner_hid, ModelHyperParams.dropout) lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, TrainTaskConfig.warmup_steps, place, @@ -98,7 +130,7 @@ def main(): beta1=TrainTaskConfig.beta1, beta2=TrainTaskConfig.beta2, epsilon=TrainTaskConfig.eps) - optimizer.minimize(cost) + optimizer.minimize(avg_cost if TrainTaskConfig.use_avg_cost else sum_cost) train_data = paddle.batch( paddle.reader.shuffle( @@ -110,27 +142,31 @@ def main(): # Program to do validation. 
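+ # Note (added for clarity): the validation program is derived from a clone of the main program + # via get_inference_program, so optimizer ops do not run during validation.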
test_program = fluid.default_main_program().clone() with fluid.program_guard(test_program): - test_program = fluid.io.get_inference_program([cost]) + test_program = fluid.io.get_inference_program([avg_cost]) val_data = paddle.batch( paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size), batch_size=TrainTaskConfig.batch_size) def test(exe): - test_costs = [] + test_total_cost = 0 + test_total_token = 0 for batch_id, data in enumerate(val_data()): - if len(data) != TrainTaskConfig.batch_size: - continue data_input = prepare_batch_input( data, encoder_input_data_names + decoder_input_data_names[:-1] + - label_data_names, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length, - ModelHyperParams.n_head) - test_cost = exe.run(test_program, - feed=data_input, - fetch_list=[cost])[0] - test_costs.append(test_cost) - return np.mean(test_costs) + label_data_names, ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, ModelHyperParams.n_head, + ModelHyperParams.d_model) + test_sum_cost, test_token_num = exe.run( + test_program, + feed=data_input, + fetch_list=[sum_cost, token_num], + use_program_cache=True) + test_total_cost += test_sum_cost + test_total_token += test_token_num + test_avg_cost = test_total_cost / test_total_token + test_ppl = np.exp([min(test_avg_cost, 100)]) + return test_avg_cost, test_ppl # Initialize the parameters. exe.run(fluid.framework.default_startup_program()) @@ -142,27 +178,30 @@ def main(): ModelHyperParams.d_model), place) for pass_id in xrange(TrainTaskConfig.pass_num): + pass_start_time = time.time() for batch_id, data in enumerate(train_data()): - # The current program desc is coupled with batch_size, thus all - # mini-batches must have the same number of instances currently. if len(data) != TrainTaskConfig.batch_size: continue data_input = prepare_batch_input( data, encoder_input_data_names + decoder_input_data_names[:-1] + - label_data_names, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length, - ModelHyperParams.n_head) + label_data_names, ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, ModelHyperParams.n_head, + ModelHyperParams.d_model) lr_scheduler.update_learning_rate(data_input) outs = exe.run(fluid.framework.default_main_program(), feed=data_input, - fetch_list=[cost], + fetch_list=[sum_cost, avg_cost], use_program_cache=True) - cost_val = np.array(outs[0]) - print("pass_id = " + str(pass_id) + " batch = " + str(batch_id) + - " cost = " + str(cost_val)) + sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1]) + print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" % + (pass_id, batch_id, sum_cost_val, avg_cost_val, + np.exp([min(avg_cost_val[0], 100)]))) # Validate and save the model for inference. 
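+ # Note (added for clarity): perplexity is exp(average per-token cost); the cost is clipped at + # 100 before exponentiating to avoid float overflow in np.exp.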
- val_cost = test(exe) - print("pass_id = " + str(pass_id) + " val_cost = " + str(val_cost)) + val_avg_cost, val_ppl = test(exe) + pass_end_time = time.time() + time_consumed = pass_end_time - pass_start_time + print("epoch: %d, val avg loss: %f, val ppl: %f, " + "consumed %fs" % (pass_id, val_avg_cost, val_ppl, time_consumed)) fluid.io.save_inference_model( os.path.join(TrainTaskConfig.model_dir, "pass_" + str(pass_id) + ".infer.model"), diff --git a/fluid/object_detection/.gitignore b/fluid/object_detection/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..b68dc43d08fbc2415a7c099112350ca940d6519c --- /dev/null +++ b/fluid/object_detection/.gitignore @@ -0,0 +1,9 @@ +./data/pascalvoc/VOCdevkit/ +data/pascalvoc/test.txt +data/pascalvoc/trainval.txt +pretrained/ssd_mobilenet_v1_coco.tar.gz +pretrained/ssd_mobilenet_v1_coco +pretrained/mobilenet_v1_imagenet.tar.gz +pretrained/mobilenet_v1_imagenet +log* +*.log diff --git a/fluid/object_detection/README.md b/fluid/object_detection/README.md index 4aa2c32865932bb949e20e32b63fc5cec2669dd0..67eccaed7303ad9bd6d5386729ae676eee52663f 100644 --- a/fluid/object_detection/README.md +++ b/fluid/object_detection/README.md @@ -2,7 +2,99 @@ The minimum PaddlePaddle version needed for the code sample in this directory is --- -# MobileNet-SSD +## SSD Object Detection -This model built with paddle fluid is still under active development and is not -the final version. We welcome feedbacks. +### Introduction + +The [Single Shot MultiBox Detector (SSD)](https://arxiv.org/abs/1512.02325) framework for object detection is based on a feed-forward convolutional network. The early network is a standard convolutional architecture for image classification, such as VGG, ResNet, or MobileNet, which is also called the base network. In this tutorial we use [MobileNet](https://arxiv.org/abs/1704.04861). + +### Data Preparation + +You can use the [PASCAL VOC dataset](http://host.robots.ox.ac.uk/pascal/VOC/) or the [MS-COCO dataset](http://cocodataset.org/#download). + +#### PASCAL VOC Dataset + +If you want to train the model on the PASCAL VOC dataset, please download the dataset first; skip this step if you already have it. + +```bash +cd data/pascalvoc +./download.sh +``` + +The `download.sh` command will also create the training and testing file lists. + +#### MS-COCO Dataset + +If you want to train the model on the MS-COCO dataset, please download the dataset first; skip this step if you already have it. + +``` +cd data/coco +./download.sh +``` + +### Train + +#### Download the Pre-trained Model. + +We provide two pre-trained models. The first is MobileNet-v1 SSD trained on the COCO dataset, with the convolutional predictors for COCO removed. This model can be used to initialize the models when training on other datasets, like PASCAL VOC. The other pre-trained model is MobileNet-v1 trained on the ImageNet 2012 dataset, with the weights and bias of the last fully-connected layer removed. + +Declaration: the MobileNet-v1 SSD model is converted from the [TensorFlow model](https://github.com/tensorflow/models/blob/f87a58cd96d45de73c9a8330a06b2ab56749a7fa/research/object_detection/g3doc/detection_model_zoo.md). The MobileNet-v1 model is converted from [Caffe](https://github.com/shicai/MobileNet-Caffe). + + - Download MobileNet-v1 SSD: + ``` + ./pretrained/download_coco.sh + ``` + - Download MobileNet-v1: + ``` + ./pretrained/download_imagenet.sh + ``` + +#### Train on PASCAL VOC + - Train on one device (GPU).
+ ```bash + env CUDA_VISIBLE_DEVICES=0 python -u train.py --parallel=False --data='pascalvoc' --pretrained_model='pretrained/ssd_mobilenet_v1_coco/' + ``` + - Train on multiple devices (GPUs). + + ```bash + env CUDA_VISIBLE_DEVICES=0,1 python -u train.py --batch_size=64 --data='pascalvoc' --pretrained_model='pretrained/ssd_mobilenet_v1_coco/' + ``` + +#### Train on MS-COCO + - Train on one device (GPU). + ```bash + env CUDA_VISIBLE_DEVICES=0 python -u train.py --parallel=False --data='coco' --pretrained_model='pretrained/mobilenet_imagenet/' + ``` + - Train on multiple devices (GPUs). + ```bash + env CUDA_VISIBLE_DEVICES=0,1 python -u train.py --batch_size=64 --data='coco' --pretrained_model='pretrained/mobilenet_imagenet/' + ``` + +TBD + +### Evaluate + +```bash +env CUDA_VISIBLE_DEVICES=0 python eval.py --model='model/90' --test_list='' +``` + +TBD + +### Infer and Visualize + +```bash +env CUDA_VISIBLE_DEVICES=0 python infer.py --batch_size=2 --model='model/90' --test_list='' +``` + +TBD + +### Released Model + + +| Model | Pre-trained Model | Training data | Test data | mAP | |:------------------------:|:------------------:|:----------------:|:------------:|:----:| |MobileNet-v1-SSD 300x300 | COCO MobileNet SSD | VOC07+12 trainval| VOC07 test | xx% | |MobileNet-v1-SSD 300x300 | ImageNet MobileNet | VOC07+12 trainval| VOC07 test | xx% | |MobileNet-v1-SSD 300x300 | ImageNet MobileNet | MS-COCO trainval | MS-COCO test | xx% | + +TBD diff --git a/fluid/object_detection/data/prepare_voc_data.py b/fluid/object_detection/data/pascalvoc/create_list.py similarity index 96% rename from fluid/object_detection/data/prepare_voc_data.py rename to fluid/object_detection/data/pascalvoc/create_list.py index a652956e91ab8277bc6670d4dc85905fc52a3203..1f53b182fdab937c250945fdb8ee1da8cd85f46e 100644 --- a/fluid/object_detection/data/prepare_voc_data.py +++ b/fluid/object_detection/data/pascalvoc/create_list.py @@ -60,4 +60,5 @@ def prepare_filelist(devkit_dir, years, output_dir): ftest.write(item[0] + ' ' + item[1] + '\n') -prepare_filelist(devkit_dir, years, '.') +if __name__ == '__main__': + prepare_filelist(devkit_dir, years, '.') diff --git a/fluid/object_detection/data/pascalvoc/download.sh b/fluid/object_detection/data/pascalvoc/download.sh new file mode 100755 index 0000000000000000000000000000000000000000..55bbb0e5a43f937ee478c9502444b22c493890ae --- /dev/null +++ b/fluid/object_detection/data/pascalvoc/download.sh @@ -0,0 +1,16 @@ +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd "$DIR" + +# Download the data. +echo "Downloading..." +wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar +wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar +wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar +# Extract the data. +echo "Extracting..." +tar -xf VOCtrainval_11-May-2012.tar +tar -xf VOCtrainval_06-Nov-2007.tar +tar -xf VOCtest_06-Nov-2007.tar + +echo "Creating data lists..."
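+# create_list.py scans the extracted VOCdevkit directories and writes the
+# trainval.txt and test.txt file lists consumed by the data reader.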
+python create_list.py diff --git a/fluid/object_detection/data/label_list b/fluid/object_detection/data/pascalvoc/label_list similarity index 100% rename from fluid/object_detection/data/label_list rename to fluid/object_detection/data/pascalvoc/label_list diff --git a/fluid/object_detection/eval.py b/fluid/object_detection/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..84fbcf82c9862ed0c9e74a4ba5bacd50372ad7ab --- /dev/null +++ b/fluid/object_detection/eval.py @@ -0,0 +1,106 @@ +import os +import time +import numpy as np +import argparse +import functools + +import paddle +import paddle.fluid as fluid +import reader +from mobilenet_ssd import mobile_net +from utility import add_arguments, print_arguments + +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('dataset', str, 'pascalvoc', "coco or pascalvoc.") +add_arg('batch_size', int, 32, "Minibatch size.") +add_arg('use_gpu', bool, True, "Whether to use GPU or not.") +add_arg('data_dir', str, '', "The data root path.") +add_arg('test_list', str, '', "The testing data lists.") +add_arg('label_file', str, '', "The label file, which save the real name and is only used for Pascal VOC.") +add_arg('model_dir', str, '', "The model path.") +add_arg('ap_version', str, '11point', "11point or integral") +add_arg('resize_h', int, 300, "The resized image height.") +add_arg('resize_w', int, 300, "The resized image width.") +add_arg('mean_value_B', float, 127.5, "mean value for B channel which will be subtracted") #123.68 +add_arg('mean_value_G', float, 127.5, "mean value for G channel which will be subtracted") #116.78 +add_arg('mean_value_R', float, 127.5, "mean value for R channel which will be subtracted") #103.94 +# yapf: enable + + +def eval(args, data_args, test_list, batch_size, model_dir=None): + image_shape = [3, data_args.resize_h, data_args.resize_w] + if data_args.dataset == 'coco': + num_classes = 81 + elif data_args.dataset == 'pascalvoc': + num_classes = 21 + + image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') + gt_box = fluid.layers.data( + name='gt_box', shape=[4], dtype='float32', lod_level=1) + gt_label = fluid.layers.data( + name='gt_label', shape=[1], dtype='int32', lod_level=1) + difficult = fluid.layers.data( + name='gt_difficult', shape=[1], dtype='int32', lod_level=1) + + locs, confs, box, box_var = mobile_net(num_classes, image, image_shape) + nmsed_out = fluid.layers.detection_output( + locs, confs, box, box_var, nms_threshold=0.45) + loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, box_var) + loss = fluid.layers.reduce_sum(loss) + + test_program = fluid.default_main_program().clone(for_test=True) + with fluid.program_guard(test_program): + map_eval = fluid.evaluator.DetectionMAP( + nmsed_out, + gt_label, + gt_box, + difficult, + num_classes, + overlap_threshold=0.5, + evaluate_difficult=False, + ap_version=args.ap_version) + + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + if model_dir: + + def if_exist(var): + return os.path.exists(os.path.join(model_dir, var.name)) + + fluid.io.load_vars(exe, model_dir, predicate=if_exist) + + test_reader = paddle.batch( + reader.test(data_args, test_list), batch_size=batch_size) + feeder = fluid.DataFeeder( + place=place, feed_list=[image, gt_box, gt_label, difficult]) + + _, accum_map = map_eval.get_map_var() + map_eval.reset(exe) + for idx, data in enumerate(test_reader()): 
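+ # Each run feeds one batch; accum_map fetches the evaluator's accumulated + # mean average precision over all batches processed so far.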
+ test_map = exe.run(test_program, + feed=feeder.feed(data), + fetch_list=[accum_map]) + if idx % 50 == 0: + print("Batch {0}, map {1}".format(idx, test_map[0])) + print("Test model {0}, map {1}".format(model_dir, test_map[0])) + + +if __name__ == '__main__': + args = parser.parse_args() + print_arguments(args) + data_args = reader.Settings( + dataset=args.dataset, + data_dir=args.data_dir, + label_file=args.label_file, + resize_h=args.resize_h, + resize_w=args.resize_w, + mean_value=[args.mean_value_B, args.mean_value_G, args.mean_value_R]) + eval( + args, + test_list=args.test_list, + data_args=data_args, + batch_size=args.batch_size, + model_dir=args.model_dir) diff --git a/fluid/object_detection/image_util.py b/fluid/object_detection/image_util.py index 781932293e57c715d15f9b26ceec345b6b81cd26..4ce53048b9f8117e937411392531eeb4090fcb67 100644 --- a/fluid/object_detection/image_util.py +++ b/fluid/object_detection/image_util.py @@ -85,8 +85,7 @@ def satisfy_sample_constraint(sampler, sample_bbox, bbox_labels): return False -def generate_batch_samples(batch_sampler, bbox_labels, image_width, - image_height): +def generate_batch_samples(batch_sampler, bbox_labels): sampled_bbox = [] index = [] c = 0 @@ -216,9 +215,9 @@ def distort_image(img, settings): def expand_image(img, bbox_labels, img_width, img_height, settings): prob = random.uniform(0, 1) - if prob < settings._hue_prob: - expand_ratio = random.uniform(1, settings._expand_max_ratio) - if expand_ratio - 1 >= 0.01: + if prob < settings._expand_prob: + if settings._expand_max_ratio - 1 >= 0.01: + expand_ratio = random.uniform(1, settings._expand_max_ratio) height = int(img_height * expand_ratio) width = int(img_width * expand_ratio) h_off = math.floor(random.uniform(0, height - img_height)) @@ -231,5 +230,5 @@ def expand_image(img, bbox_labels, img_width, img_height, settings): expand_img = Image.fromarray(expand_img) expand_img.paste(img, (int(w_off), int(h_off))) bbox_labels = transform_labels(bbox_labels, expand_bbox) - return expand_img, bbox_labels - return img, bbox_labels + return expand_img, bbox_labels, width, height + return img, bbox_labels, img_width, img_height diff --git a/fluid/object_detection/load_model.py b/fluid/object_detection/load_model.py deleted file mode 100644 index 8c7389efea33699b2f90243311ff89747f831d06..0000000000000000000000000000000000000000 --- a/fluid/object_detection/load_model.py +++ /dev/null @@ -1,67 +0,0 @@ -import paddle.v2 as paddle -import paddle.fluid as fluid -import numpy as np - - -# From npy -def load_vars(): - vars = {} - name_map = {} - with open('./ssd_mobilenet_v1_coco/names.map', 'r') as map_file: - for param in map_file: - fd_name, tf_name = param.strip().split('\t') - name_map[fd_name] = tf_name - - tf_vars = np.load( - './ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco_2017_11_17.npy').item() - for fd_name in name_map: - tf_name = name_map[fd_name] - tf_var = tf_vars[tf_name] - if len(tf_var.shape) == 4 and 'depthwise' in tf_name: - vars[fd_name] = np.transpose(tf_var, (2, 3, 0, 1)) - elif len(tf_var.shape) == 4: - vars[fd_name] = np.transpose(tf_var, (3, 2, 0, 1)) - else: - vars[fd_name] = tf_var - - return vars - - -def load_and_set_vars(place): - vars = load_vars() - for k, v in vars.items(): - t = fluid.global_scope().find_var(k).get_tensor() - #print(np.array(t).shape, v.shape, k) - assert np.array(t).shape == v.shape - t.set(v, place) - - -# From Paddle V1 -def load_paddlev1_vars(place): - vars = {} - name_map = {} - with open('./caffe2paddle/names.map', 'r') as map_file: - for 
param in map_file:
-            fd_name, tf_name = param.strip().split('\t')
-            name_map[fd_name] = tf_name
-
-    from operator import mul
-
-    def load(file_name, shape):
-        with open(file_name, 'rb') as f:
-            f.read(16)
-            arr = np.fromfile(f, dtype=np.float32)
-            #print(arr.size, reduce(mul, shape), file_name)
-            assert arr.size == reduce(mul, shape)
-            return arr.reshape(shape)
-
-    for fd_name in name_map:
-        v1_name = name_map[fd_name]
-        t = fluid.global_scope().find_var(fd_name).get_tensor()
-        shape = np.array(t).shape
-        v1_var = load('./caffe2paddle/' + v1_name, shape)
-        t.set(v1_var, place)
-
-
-if __name__ == "__main__":
-    load_vars()
diff --git a/fluid/object_detection/mobilenet_ssd.py b/fluid/object_detection/mobilenet_ssd.py
index 21869647aa261a1baacbe934453c8af4416b75b7..c39883196056aede5d410554e14a0198e540d754 100644
--- a/fluid/object_detection/mobilenet_ssd.py
+++ b/fluid/object_detection/mobilenet_ssd.py
@@ -27,12 +27,7 @@ def conv_bn(input,
         bias_attr=False)
     parameter_attr = ParamAttr(learning_rate=0.1, initializer=MSRA())
     bias_attr = ParamAttr(learning_rate=0.2)
-    return fluid.layers.batch_norm(
-        input=conv,
-        act=act,
-        epsilon=0.00001,
-        param_attr=parameter_attr,
-        bias_attr=bias_attr)
+    return fluid.layers.batch_norm(input=conv, act=act)


 def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride,
@@ -76,7 +71,7 @@ def extra_block(input, num_filters1, num_filters2, num_groups, stride, scale):
     return normal_conv


-def mobile_net(img, img_shape, scale=1.0):
+def mobile_net(num_classes, img, img_shape, scale=1.0):
     # 300x300
     tmp = conv_bn(img, 3, int(32 * scale), 2, 1, 3)
     # 150x150
@@ -104,10 +99,11 @@ def mobile_net(img, img_shape, scale=1.0):
     module16 = extra_block(module15, 128, 256, 1, 2, scale)
     # 2x2
     module17 = extra_block(module16, 64, 128, 1, 2, scale)
+
     mbox_locs, mbox_confs, box, box_var = fluid.layers.multi_box_head(
         inputs=[module11, module13, module14, module15, module16, module17],
         image=img,
-        num_classes=21,
+        num_classes=num_classes,
         min_ratio=20,
         max_ratio=90,
         min_sizes=[60.0, 105.0, 150.0, 195.0, 240.0, 285.0],
diff --git a/fluid/object_detection/pretrained/download_coco.sh b/fluid/object_detection/pretrained/download_coco.sh
new file mode 100755
index 0000000000000000000000000000000000000000..1cd1836f7c6e32f9f308a0c9a29d10efbc6f183f
--- /dev/null
+++ b/fluid/object_detection/pretrained/download_coco.sh
@@ -0,0 +1,8 @@
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd "$DIR"
+
+# Download the data.
+echo "Downloading..."
+wget http://paddlemodels.bj.bcebos.com/ssd_mobilenet_v1_coco.tar.gz
+echo "Extracting..."
+tar -xf ssd_mobilenet_v1_coco.tar.gz
diff --git a/fluid/object_detection/pretrained/download_imagenet.sh b/fluid/object_detection/pretrained/download_imagenet.sh
new file mode 100755
index 0000000000000000000000000000000000000000..eb7c6767d9f9585342c2ba89a2f28f070d1351c2
--- /dev/null
+++ b/fluid/object_detection/pretrained/download_imagenet.sh
@@ -0,0 +1,8 @@
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd "$DIR"
+
+# Download the data.
+echo "Downloading..."
+wget http://paddlemodels.bj.bcebos.com/mobilenet_v1_imagenet.tar.gz
+echo "Extracting..."
+tar -xf mobilenet_v1_imagenet.tar.gz diff --git a/fluid/object_detection/reader.py b/fluid/object_detection/reader.py index 6a6beb6e50f5b0a7f6b969ca53868178db2527a6..78efcc4a517001023c72c9d82c6972d60e830c6c 100644 --- a/fluid/object_detection/reader.py +++ b/fluid/object_detection/reader.py @@ -16,19 +16,33 @@ import image_util from paddle.utils.image_util import * import random from PIL import Image +from PIL import ImageDraw import numpy as np import xml.etree.ElementTree import os +import time +import copy class Settings(object): - def __init__(self, data_dir, label_file, resize_h, resize_w, mean_value, - apply_distort, apply_expand): + def __init__(self, + dataset=None, + data_dir=None, + label_file=None, + resize_h=300, + resize_w=300, + mean_value=[127.5, 127.5, 127.5], + apply_distort=True, + apply_expand=True, + toy=0): + self._dataset = dataset + self._toy = toy self._data_dir = data_dir - self._label_list = [] - label_fpath = os.path.join(data_dir, label_file) - for line in open(label_fpath): - self._label_list.append(line.strip()) + if dataset == "pascalvoc": + self._label_list = [] + label_fpath = os.path.join(data_dir, label_file) + for line in open(label_fpath): + self._label_list.append(line.strip()) self._apply_distort = apply_distort self._apply_expand = apply_expand @@ -47,6 +61,14 @@ class Settings(object): self._brightness_prob = 0.5 self._brightness_delta = 0.125 + @property + def dataset(self): + return self._dataset + + @property + def toy(self): + return self._toy + @property def apply_distort(self): return self._apply_expand @@ -59,6 +81,10 @@ class Settings(object): def data_dir(self): return self._data_dir + @data_dir.setter + def data_dir(self, data_dir): + self._data_dir = data_dir + @property def label_list(self): return self._label_list @@ -76,133 +102,249 @@ class Settings(object): return self._img_mean -def _reader_creator(settings, file_list, mode, shuffle): +def preprocess(img, bbox_labels, mode, settings): + img_width, img_height = img.size + sampled_labels = bbox_labels + if mode == 'train': + if settings._apply_distort: + img = image_util.distort_image(img, settings) + if settings._apply_expand: + img, bbox_labels, img_width, img_height = image_util.expand_image( + img, bbox_labels, img_width, img_height, settings) + # sampling + batch_sampler = [] + # hard-code here + batch_sampler.append( + image_util.sampler(1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0)) + sampled_bbox = image_util.generate_batch_samples(batch_sampler, + bbox_labels) + + img = np.array(img) + if len(sampled_bbox) > 0: + idx = int(random.uniform(0, len(sampled_bbox))) + img, sampled_labels = image_util.crop_image( + img, bbox_labels, sampled_bbox[idx], img_width, img_height) + + img = Image.fromarray(img) + img = img.resize((settings.resize_w, settings.resize_h), Image.ANTIALIAS) + img = np.array(img) + + if mode == 'train': + mirror = int(random.uniform(0, 2)) + if mirror == 1: + img = img[:, ::-1, :] + for i in xrange(len(sampled_labels)): + tmp = sampled_labels[i][1] + sampled_labels[i][1] = 1 - 
sampled_labels[i][3] + sampled_labels[i][3] = 1 - tmp + # HWC to CHW + if len(img.shape) == 3: + img = np.swapaxes(img, 1, 2) + img = np.swapaxes(img, 1, 0) + # RBG to BGR + img = img[[2, 1, 0], :, :] + img = img.astype('float32') + img -= settings.img_mean + img = img * 0.007843 + return img, sampled_labels + + +def coco(settings, file_list, mode, shuffle): + # cocoapi + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + coco = COCO(file_list) + image_ids = coco.getImgIds() + images = coco.loadImgs(image_ids) + category_ids = coco.getCatIds() + category_names = [item['name'] for item in coco.loadCats(category_ids)] + + if not settings.toy == 0: + images = images[:settings.toy] if len(images) > settings.toy else images + print("{} on {} with {} images".format(mode, settings.dataset, len(images))) + def reader(): - with open(file_list) as flist: - lines = [line.strip() for line in flist] - if shuffle: - random.shuffle(lines) - for line in lines: - if mode == 'train' or mode == 'test': - img_path, label_path = line.split() - img_path = os.path.join(settings.data_dir, img_path) - label_path = os.path.join(settings.data_dir, label_path) - elif mode == 'infer': - img_path = os.path.join(settings.data_dir, line) - - img = Image.open(img_path) - img_width, img_height = img.size - - # layout: label | xmin | ymin | xmax | ymax | difficult - if mode == 'train' or mode == 'test': - bbox_labels = [] - root = xml.etree.ElementTree.parse(label_path).getroot() - for object in root.findall('object'): - bbox_sample = [] - # start from 1 - bbox_sample.append( - float( - settings.label_list.index( - object.find('name').text))) - bbox = object.find('bndbox') - difficult = float(object.find('difficult').text) - bbox_sample.append( - float(bbox.find('xmin').text) / img_width) - bbox_sample.append( - float(bbox.find('ymin').text) / img_height) - bbox_sample.append( - float(bbox.find('xmax').text) / img_width) - bbox_sample.append( - float(bbox.find('ymax').text) / img_height) - bbox_sample.append(difficult) - bbox_labels.append(bbox_sample) - - sample_labels = bbox_labels - if mode == 'train': - if settings._apply_distort: - img = image_util.distort_image(img, settings) - if settings._apply_expand: - img, bbox_labels = image_util.expand_image( - img, bbox_labels, img_width, img_height, - settings) - batch_sampler = [] - # hard-code here - batch_sampler.append( - image_util.sampler(1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, - 0.0)) - batch_sampler.append( - image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, - 0.0)) - batch_sampler.append( - image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, - 0.0)) - batch_sampler.append( - image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, - 0.0)) - batch_sampler.append( - image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, - 0.0)) - batch_sampler.append( - image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, - 0.0)) - batch_sampler.append( - image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, - 1.0)) - """ random crop """ - sampled_bbox = image_util.generate_batch_samples( - batch_sampler, bbox_labels, img_width, img_height) - - img = np.array(img) - if len(sampled_bbox) > 0: - idx = int(random.uniform(0, len(sampled_bbox))) - img, sample_labels = image_util.crop_image( - img, bbox_labels, sampled_bbox[idx], img_width, - img_height) - - img = Image.fromarray(img) - img = img.resize((settings.resize_w, settings.resize_h), - Image.ANTIALIAS) - img = np.array(img) - - if mode == 'train': - mirror = int(random.uniform(0, 2)) - if mirror == 1: - img = img[:, ::-1, :] 
- for i in xrange(len(sample_labels)): - tmp = sample_labels[i][1] - sample_labels[i][1] = 1 - sample_labels[i][3] - sample_labels[i][3] = 1 - tmp - - if len(img.shape) == 3: - img = np.swapaxes(img, 1, 2) - img = np.swapaxes(img, 1, 0) - - img = img[[2, 1, 0], :, :] - img = img.astype('float32') - img -= settings.img_mean - img = img.flatten() - img = img * 0.007843 - - sample_labels = np.array(sample_labels) - if mode == 'train' or mode == 'test': - if mode == 'train' and len(sample_labels) == 0: continue - yield img.astype( - 'float32' - ), sample_labels[:, 1:5], sample_labels[:, 0].astype( - 'int32'), sample_labels[:, -1].astype('int32') - elif mode == 'infer': - yield img.astype('float32') + if mode == 'train' and shuffle: + random.shuffle(images) + for image in images: + image_name = image['file_name'] + image_path = os.path.join(settings.data_dir, image_name) + + im = Image.open(image_path) + if im.mode == 'L': + im = im.convert('RGB') + im_width, im_height = im.size + + # layout: category_id | xmin | ymin | xmax | ymax | iscrowd | + # origin_coco_bbox | segmentation | area | image_id | annotation_id + bbox_labels = [] + annIds = coco.getAnnIds(imgIds=image['id']) + anns = coco.loadAnns(annIds) + for ann in anns: + bbox_sample = [] + # start from 1, leave 0 to background + bbox_sample.append( + float(category_ids.index(ann['category_id'])) + 1) + bbox = ann['bbox'] + xmin, ymin, w, h = bbox + xmax = xmin + w + ymax = ymin + h + bbox_sample.append(float(xmin) / im_width) + bbox_sample.append(float(ymin) / im_height) + bbox_sample.append(float(xmax) / im_width) + bbox_sample.append(float(ymax) / im_height) + bbox_sample.append(float(ann['iscrowd'])) + bbox_labels.append(bbox_sample) + im, sample_labels = preprocess(im, bbox_labels, mode, settings) + sample_labels = np.array(sample_labels) + if len(sample_labels) == 0: continue + im = im.astype('float32') + boxes = sample_labels[:, 1:5] + lbls = sample_labels[:, 0].astype('int32') + difficults = sample_labels[:, -1].astype('int32') + yield im, boxes, lbls, difficults + + return reader + + +def pascalvoc(settings, file_list, mode, shuffle): + flist = open(file_list) + images = [line.strip() for line in flist] + if not settings.toy == 0: + images = images[:settings.toy] if len(images) > settings.toy else images + print("{} on {} with {} images".format(mode, settings.dataset, len(images))) + + def reader(): + if mode == 'train' and shuffle: + random.shuffle(images) + for image in images: + image_path, label_path = image.split() + image_path = os.path.join(settings.data_dir, image_path) + label_path = os.path.join(settings.data_dir, label_path) + + im = Image.open(image_path) + if im.mode == 'L': + im = im.convert('RGB') + im_width, im_height = im.size + + # layout: label | xmin | ymin | xmax | ymax | difficult + bbox_labels = [] + root = xml.etree.ElementTree.parse(label_path).getroot() + for object in root.findall('object'): + bbox_sample = [] + # start from 1 + bbox_sample.append( + float(settings.label_list.index(object.find('name').text))) + bbox = object.find('bndbox') + difficult = float(object.find('difficult').text) + bbox_sample.append(float(bbox.find('xmin').text) / im_width) + bbox_sample.append(float(bbox.find('ymin').text) / im_height) + bbox_sample.append(float(bbox.find('xmax').text) / im_width) + bbox_sample.append(float(bbox.find('ymax').text) / im_height) + bbox_sample.append(difficult) + bbox_labels.append(bbox_sample) + im, sample_labels = preprocess(im, bbox_labels, mode, settings) + sample_labels = 
np.array(sample_labels)
+            if len(sample_labels) == 0: continue
+            im = im.astype('float32')
+            boxes = sample_labels[:, 1:5]
+            lbls = sample_labels[:, 0].astype('int32')
+            difficults = sample_labels[:, -1].astype('int32')
+            yield im, boxes, lbls, difficults

     return reader


+def draw_bounding_box_on_image(image,
+                               sample_labels,
+                               image_name,
+                               category_names,
+                               color='red',
+                               thickness=4,
+                               with_text=True,
+                               normalized=True):
+    image = Image.fromarray(image)
+    draw = ImageDraw.Draw(image)
+    im_width, im_height = image.size
+    if not normalized:
+        im_width, im_height = 1, 1
+    for item in sample_labels:
+        label = item[0]
+        category_name = category_names[int(label)]
+        bbox = item[1:5]
+        xmin, ymin, xmax, ymax = bbox
+        (left, right, top, bottom) = (xmin * im_width, xmax * im_width,
+                                      ymin * im_height, ymax * im_height)
+        draw.line(
+            [(left, top), (left, bottom), (right, bottom), (right, top),
+             (left, top)],
+            width=thickness,
+            fill=color)
+        if with_text:
+            if image.mode == 'RGB':
+                draw.text((left, top), category_name, (255, 255, 0))
+    image.save(image_name)
+
+
 def train(settings, file_list, shuffle=True):
-    return _reader_creator(settings, file_list, 'train', shuffle)
+    file_list = os.path.join(settings.data_dir, file_list)
+    if settings.dataset == 'coco':
+        train_settings = copy.copy(settings)
+        if '2014' in file_list:
+            sub_dir = "train2014"
+        elif '2017' in file_list:
+            sub_dir = "train2017"
+        train_settings.data_dir = os.path.join(settings.data_dir, sub_dir)
+        return coco(train_settings, file_list, 'train', shuffle)
+    else:
+        return pascalvoc(settings, file_list, 'train', shuffle)


 def test(settings, file_list):
-    return _reader_creator(settings, file_list, 'test', False)
+    file_list = os.path.join(settings.data_dir, file_list)
+    if settings.dataset == 'coco':
+        test_settings = copy.copy(settings)
+        if '2014' in file_list:
+            sub_dir = "val2014"
+        elif '2017' in file_list:
+            sub_dir = "val2017"
+        test_settings.data_dir = os.path.join(settings.data_dir, sub_dir)
+        return coco(test_settings, file_list, 'test', False)
+    else:
+        return pascalvoc(settings, file_list, 'test', False)


-def infer(settings, file_list):
-    return _reader_creator(settings, file_list, 'infer', False)
+def infer(settings, image_path):
+    def reader():
+        im = Image.open(image_path)
+        if im.mode == 'L':
+            im = im.convert('RGB')
+        im_width, im_height = im.size
+        img = im.resize((settings.resize_w, settings.resize_h),
+                        Image.ANTIALIAS)
+        img = np.array(img)
+        # HWC to CHW
+        if len(img.shape) == 3:
+            img = np.swapaxes(img, 1, 2)
+            img = np.swapaxes(img, 1, 0)
+        # RGB to BGR
+        img = img[[2, 1, 0], :, :]
+        img = img.astype('float32')
+        img -= settings.img_mean
+        img = img * 0.007843
+        yield img
+
+    return reader
diff --git a/fluid/object_detection/train.py b/fluid/object_detection/train.py
index dbd0c8d39b173558f71205f8aae16ef4d3e724b6..71fa61322d5e58e6726796463b559ccc1e584d7a 100644
--- a/fluid/object_detection/train.py
+++ b/fluid/object_detection/train.py
@@ -1,32 +1,54 @@
-import paddle.v2 as paddle
-import paddle.fluid as fluid
-import reader
-import load_model as load_model
-from mobilenet_ssd import mobile_net
-from utility import add_arguments, print_arguments
 import os
+import time
 import numpy as np
 import argparse
 import functools
+import shutil
+
+import paddle
+import paddle.fluid as fluid
+import reader
+from mobilenet_ssd import mobile_net
+from utility import add_arguments, print_arguments

 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('parallel',        bool,  True,      "Whether use parallel training.")
-add_arg('use_gpu',         bool,  True,      "Whether use GPU.")
-# yapf: disable
+add_arg('learning_rate',    float, 0.001,     "Learning rate.")
+add_arg('batch_size',       int,   32,        "Minibatch size.")
+add_arg('num_passes',       int,   120,       "Epoch number.")
+add_arg('parallel',         bool,  True,      "Whether to use parallel training.")
+add_arg('use_gpu',          bool,  True,      "Whether to use GPU or not.")
+add_arg('use_nccl',         bool,  False,     "Whether to use NCCL or not.")
+add_arg('dataset',          str,   'pascalvoc', "coco or pascalvoc.")
+add_arg('model_save_dir',   str,   'model',   "The path to save model.")
+add_arg('pretrained_model', str,   'pretrained/ssd_mobilenet_v1_coco/', "The init model path.")
+add_arg('apply_distort',    bool,  True,      "Whether to apply distortion.")
+add_arg('apply_expand',     bool,  True,      "Whether to apply expansion.")
+add_arg('ap_version',       str,   '11point', "11point or integral")
+add_arg('resize_h',         int,   300,       "The resized image height.")
+add_arg('resize_w',         int,   300,       "The resized image width.")
+add_arg('mean_value_B',     float, 127.5,     "Mean value for B channel which will be subtracted.")  #123.68
+add_arg('mean_value_G',     float, 127.5,     "Mean value for G channel which will be subtracted.")  #116.78
+add_arg('mean_value_R',     float, 127.5,     "Mean value for R channel which will be subtracted.")  #103.94
+add_arg('is_toy',           int,   0,         "Toy for quick debugging. 0 means using all data, while n means using only n samples.")
+# yapf: enable


-def train(args,
-          train_file_list,
-          val_file_list,
-          data_args,
-          learning_rate,
-          batch_size,
-          num_passes,
-          model_save_dir='model',
-          init_model_path=None):
+def parallel_do(args,
+                train_file_list,
+                val_file_list,
+                data_args,
+                learning_rate,
+                batch_size,
+                num_passes,
+                model_save_dir,
+                pretrained_model=None):
     image_shape = [3, data_args.resize_h, data_args.resize_w]
+    if data_args.dataset == 'coco':
+        num_classes = 81
+    elif data_args.dataset == 'pascalvoc':
+        num_classes = 21

     image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
     gt_box = fluid.layers.data(
@@ -38,47 +60,53 @@ def train(args,

     if args.parallel:
         places = fluid.layers.get_places()
-        pd = fluid.layers.ParallelDo(places)
+        pd = fluid.layers.ParallelDo(places, use_nccl=args.use_nccl)
         with pd.do():
             image_ = pd.read_input(image)
             gt_box_ = pd.read_input(gt_box)
             gt_label_ = pd.read_input(gt_label)
             difficult_ = pd.read_input(difficult)
-            locs, confs, box, box_var = mobile_net(image_, image_shape)
-            loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_,
-                                         box, box_var)
+            locs, confs, box, box_var = mobile_net(num_classes, image_,
+                                                   image_shape)
+            loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_, box,
+                                         box_var)
+            nmsed_out = fluid.layers.detection_output(
+                locs, confs, box, box_var, nms_threshold=0.45)
+            loss = fluid.layers.reduce_sum(loss)
             pd.write_output(loss)
-            pd.write_output(locs)
-            pd.write_output(confs)
-            pd.write_output(box)
-            pd.write_output(box_var)
+            pd.write_output(nmsed_out)

-        loss, locs, confs, box, box_var = pd()
-        loss = fluid.layers.reduce_sum(loss)
+        loss, nmsed_out = pd()
+        loss = fluid.layers.mean(loss)
     else:
-        locs, confs, box, box_var = mobile_net(image, image_shape)
+        locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
         nmsed_out = fluid.layers.detection_output(
-            locs, mbox_confs, box, box_var, nms_threshold=0.45)
-        loss = fluid.layers.ssd_loss(locs, mbox_confs, gt_box, gt_label,
-                                     box, box_var)
+            locs, confs, box, box_var, nms_threshold=0.45)
+        loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box,
+                                     box_var)
         loss = fluid.layers.reduce_sum(loss)

     test_program = fluid.default_main_program().clone(for_test=True)
     with fluid.program_guard(test_program):
-        nmsed_out = fluid.layers.detection_output(
-            locs, confs, box, box_var, nms_threshold=0.45)
         map_eval = fluid.evaluator.DetectionMAP(
             nmsed_out,
             gt_label,
             gt_box,
             difficult,
-            21,
+            num_classes,
             overlap_threshold=0.5,
             evaluate_difficult=False,
-            ap_version='11point')
+            ap_version=args.ap_version)

-    boundaries = [40000, 60000]
-    values = [0.001, 0.0005, 0.00025]
+    if data_args.dataset == 'coco':
+        # learning rate decays at the 12th and 19th pass, respectively
+        if '2014' in train_file_list:
+            boundaries = [82783 / batch_size * 12, 82783 / batch_size * 19]
+        elif '2017' in train_file_list:
+            boundaries = [118287 / batch_size * 12, 118287 / batch_size * 19]
+    elif data_args.dataset == 'pascalvoc':
+        boundaries = [40000, 60000]
+    values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25]
     optimizer = fluid.optimizer.RMSProp(
         learning_rate=fluid.layers.piecewise_decay(boundaries, values),
         regularization=fluid.regularizer.L2Decay(0.00005), )
@@ -89,8 +117,13 @@ def train(args,
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())

-    load_model.load_and_set_vars(place)
-    #load_model.load_paddlev1_vars(place)
+    if pretrained_model:
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(pretrained_model, var.name))
+
+        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
     train_reader = paddle.batch(
         reader.train(data_args, train_file_list), batch_size=batch_size)
     test_reader = paddle.batch(
@@ -98,50 +131,207 @@ def train(args,
     feeder = fluid.DataFeeder(
         place=place, feed_list=[image, gt_box, gt_label, difficult])

-    #print 'test_program ', test_program
     def test(pass_id):
         _, accum_map = map_eval.get_map_var()
         map_eval.reset(exe)
         test_map = None
-        for _, data in enumerate(test_reader()):
+        for data in test_reader():
             test_map = exe.run(test_program,
                                feed=feeder.feed(data),
                                fetch_list=[accum_map])
         print("Test {0}, map {1}".format(pass_id, test_map[0]))

-    #print 'main_program ', fluid.default_main_program()
     for pass_id in range(num_passes):
+        start_time = time.time()
+        prev_start_time = start_time
+        end_time = 0
         for batch_id, data in enumerate(train_reader()):
+            prev_start_time = start_time
+            start_time = time.time()
             loss_v = exe.run(fluid.default_main_program(),
                              feed=feeder.feed(data),
                              fetch_list=[loss])
+            end_time = time.time()
             if batch_id % 20 == 0:
-                print("Pass {0}, batch {1}, loss {2}"
-                      .format(pass_id, batch_id, loss_v[0]))
+                print("Pass {0}, batch {1}, loss {2}, time {3}".format(
+                    pass_id, batch_id, loss_v[0], start_time - prev_start_time))
         test(pass_id)

-        if pass_id % 10 == 0:
+        if pass_id % 10 == 0 or pass_id == num_passes - 1:
             model_path = os.path.join(model_save_dir, str(pass_id))
             print 'save models to %s' % (model_path)
-            fluid.io.save_inference_model(model_path, ['image'], [nmsed_out],
-                                          exe)
+            fluid.io.save_persistables(exe, model_path)
+
+
+def parallel_exe(args,
+                 train_file_list,
+                 val_file_list,
+                 data_args,
+                 learning_rate,
+                 batch_size,
+                 num_passes,
+                 model_save_dir='model',
+                 pretrained_model=None):
+    image_shape = [3, data_args.resize_h, data_args.resize_w]
+    if data_args.dataset == 'coco':
+        num_classes = 81
+    elif data_args.dataset == 'pascalvoc':
+        num_classes = 21
+
+    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
+    devices_num = len(devices.split(","))
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    gt_box = fluid.layers.data(
+        name='gt_box', shape=[4], dtype='float32', lod_level=1)
+    gt_label = fluid.layers.data(
+        name='gt_label', shape=[1], dtype='int32', lod_level=1)
+    difficult = fluid.layers.data(
+        name='gt_difficult', shape=[1], dtype='int32', lod_level=1)
+
+    locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
+    nmsed_out = fluid.layers.detection_output(
+        locs, confs, box, box_var, nms_threshold=0.45)
+    loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, box_var)
+    loss = fluid.layers.reduce_sum(loss)
+
+    test_program = fluid.default_main_program().clone(for_test=True)
+    with fluid.program_guard(test_program):
+        map_eval = fluid.evaluator.DetectionMAP(
+            nmsed_out,
+            gt_label,
+            gt_box,
+            difficult,
+            num_classes,
+            overlap_threshold=0.5,
+            evaluate_difficult=False,
+            ap_version=args.ap_version)
+
+    if data_args.dataset == 'coco':
+        # learning rate decays at the 12th and 19th pass, respectively
+        if '2014' in train_file_list:
+            epocs = 82783 / batch_size
+            boundaries = [epocs * 12, epocs * 19]
+        elif '2017' in train_file_list:
+            epocs = 118287 / batch_size
+            boundaries = [epocs * 12, epocs * 19]
+    elif data_args.dataset == 'pascalvoc':
+        epocs = 19200 / batch_size
+        boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
+    values = [
+        learning_rate, learning_rate * 0.5, learning_rate * 0.25,
+        learning_rate * 0.1, learning_rate * 0.01
+    ]
+    optimizer = fluid.optimizer.RMSProp(
+        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
+        regularization=fluid.regularizer.L2Decay(0.00005), )
+
+    optimizer.minimize(loss)
+
+    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if pretrained_model:
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(pretrained_model, var.name))
+
+        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
+    if args.parallel:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=args.use_gpu, loss_name=loss.name)
+
+    train_reader = paddle.batch(
+        reader.train(data_args, train_file_list), batch_size=batch_size)
+    test_reader = paddle.batch(
+        reader.test(data_args, val_file_list), batch_size=batch_size)
+    feeder = fluid.DataFeeder(
+        place=place, feed_list=[image, gt_box, gt_label, difficult])
+
+    def save_model(postfix):
+        model_path = os.path.join(model_save_dir, postfix)
+        if os.path.isdir(model_path):
+            shutil.rmtree(model_path)
+        print 'save models to %s' % (model_path)
+        fluid.io.save_persistables(exe, model_path)
+
+    best_map = 0.
+
+    def test(pass_id, best_map):
+        _, accum_map = map_eval.get_map_var()
+        map_eval.reset(exe)
+        test_map = None
+        for data in test_reader():
+            test_map = exe.run(test_program,
+                               feed=feeder.feed(data),
+                               fetch_list=[accum_map])
+        if test_map[0] > best_map:
+            best_map = test_map[0]
+            save_model('best_model')
+        print("Test {0}, map {1}".format(pass_id, test_map[0]))
+        # return the updated best map so the caller can keep track of it
+        return best_map
+
+    for pass_id in range(num_passes):
+        start_time = time.time()
+        prev_start_time = start_time
+        end_time = 0
+        for batch_id, data in enumerate(train_reader()):
+            prev_start_time = start_time
+            start_time = time.time()
+            if len(data) < devices_num: continue
+            if args.parallel:
+                loss_v, = train_exe.run(fetch_list=[loss.name],
+                                        feed_dict=feeder.feed(data))
+            else:
+                loss_v, = exe.run(fluid.default_main_program(),
                                  feed=feeder.feed(data),
+                                  fetch_list=[loss])
+            end_time = time.time()
+            loss_v = np.mean(np.array(loss_v))
+            if batch_id % 20 == 0:
+                print("Pass {0}, batch {1}, loss {2}, time {3}".format(
+                    pass_id, batch_id, loss_v, start_time - prev_start_time))
+        best_map = test(pass_id, best_map)
+        if pass_id % 10 == 0 or pass_id == num_passes - 1:
+            save_model(str(pass_id))
+    print("Best test map {0}".format(best_map))


 if __name__ == '__main__':
     args = parser.parse_args()
     print_arguments(args)
+
+    data_dir = 'data/pascalvoc'
+    train_file_list = 'trainval.txt'
+    val_file_list = 'test.txt'
+    label_file = 'label_list'
+    model_save_dir = args.model_save_dir
+    if args.dataset == 'coco':
+        data_dir = './data/COCO17'
+        train_file_list = 'annotations/instances_train2017.json'
+        val_file_list = 'annotations/instances_val2017.json'
+        label_file = 'label_list'
+
     data_args = reader.Settings(
-        data_dir='./data',
-        label_file='label_list',
-        apply_distort=True,
-        apply_expand=True,
-        resize_h=300,
-        resize_w=300,
-        mean_value=[127.5, 127.5, 127.5])
-    train(args,
-          train_file_list='./data/trainval.txt',
-          val_file_list='./data/test.txt',
-          data_args=data_args,
-          learning_rate=0.001,
-          batch_size=32,
-          num_passes=300)
+        dataset=args.dataset,
+        data_dir=data_dir,
+        label_file=label_file,
+        apply_distort=args.apply_distort,
+        apply_expand=args.apply_expand,
+        resize_h=args.resize_h,
+        resize_w=args.resize_w,
+        mean_value=[args.mean_value_B, args.mean_value_G, args.mean_value_R],
+        toy=args.is_toy)
+    #method = parallel_do
+    method = parallel_exe
+    method(
+        args,
+        train_file_list=train_file_list,
+        val_file_list=val_file_list,
+        data_args=data_args,
+        learning_rate=args.learning_rate,
+        batch_size=args.batch_size,
+        num_passes=args.num_passes,
+        model_save_dir=model_save_dir,
+        pretrained_model=args.pretrained_model)
diff --git a/fluid/ocr_recognition/README.md b/fluid/ocr_recognition/README.md
index e71386a8e9a5c94633d31ce9bf40e26dd483fa87..7d35846fb1b67ce4fec7f364a22dce9cb853bb24 100644
--- a/fluid/ocr_recognition/README.md
+++ b/fluid/ocr_recognition/README.md
@@ -1,4 +1,179 @@
-# OCR Model
+
+[toc]

-This model built with paddle fluid is still under active development and is not
-the final version. We welcome feedbacks.
+The example programs in this directory require the latest develop version of PaddlePaddle. If your installed version of PaddlePaddle is lower than this requirement, please update it following the instructions in the installation documentation.
+
+# Optical Character Recognition
+
+This document describes how to recognize the text contained in images with the CRNN-CTC and CRNN-Attention models under PaddlePaddle Fluid.
+
+## 1. CRNN-CTC
+
+The task in this chapter is to recognize images containing a single line of Chinese characters. Convolution layers first turn the image into a feature map; the `im2sequence` op then converts the feature map into a sequence, and a bidirectional GRU RNN produces a probability distribution over Chinese characters at each step. The loss used during training is the CTC loss, and the final evaluation metric is the instance error rate. (A rough, illustrative sketch of this pipeline is given in Section 1.5.)
+
+The files under this path serve the following purposes:
+
+- **ctc_reader.py:** downloads, reads and processes the data. It provides the methods `train()` and `test()`, which produce the data iterators for the training set and the test set respectively.
+- **crnn_ctc_model.py:** defines the training network, the inference network and the evaluation network.
+- **ctc_train.py:** trains the model; run `python ctc_train.py --help` for usage.
+- **inference.py:** loads a trained model file and predicts on new data; run `python inference.py --help` for usage.
+- **eval.py:** evaluates the model on a given dataset; run `python eval.py --help` for usage.
+- **utility.py:** implements common helpers, including argument parsing and tensor construction.
+
+
+### 1.1 Data
+
+Downloading and simple preprocessing of the data are both implemented in `ctc_reader.py`.
+
+#### 1.1.1 Data format
+
+The training and testing data we use are illustrated in Figure 1: each image contains a single line of Chinese characters of variable length, and the images have been pre-cropped by a detection algorithm.
+
+<p align="center">
+Figure 1
+</p>
+
+In the training set, the label of each image is a sequence of integers. Each integer in the sequence is the index of a character in the dictionary. The label corresponding to Figure 1 looks like this:
+```
+3835,8371,7191,2369,6876,4162,1938,168,1517,4590,3793
+```
+In this label, `3835` is the index of the character '两', and `4590` is the index of the Chinese comma.
+
+
+#### 1.1.2 Data preparation
+
+**A. Training set**
+
+Put all images that take part in training into one folder, referred to below as `train_images`. Then use a list file to record the information of each image, including the image size, the image file name and the corresponding label; this list file is referred to below as `train_list`, and its format is as follows:
+
+```
+185 48 00508_0215.jpg 7740,5332,2369,3201,4162
+48 48 00197_1893.jpg 6569
+338 48 00007_0219.jpg 4590,4788,3015,1994,3402,999,4553
+150 48 00107_4517.jpg 5936,3382,1437,3382
+...
+157 48 00387_0622.jpg 2397,1707,5919,1278
+```
+
+<p align="center">The file train_list</p>
+
+Each line of the file above describes one image and is split by spaces into four columns: the first two columns are the width and the height of the image, the third column is the image file name, and the fourth column is the sequence label of the image. (A small parsing sketch is given in Section 1.5.)
+Finally, the files should be laid out like this:
+
+```
+|-train_data
+    |- train_list
+    |- train_images
+        |- 00508_0215.jpg
+        |- 00197_1893.jpg
+        |- 00007_0219.jpg
+        | ...
+```
+
+At training time, point the options `--train_images` and `--train_list` at the prepared `train_images` folder and `train_list` file respectively.
+
+
+>**Note:** If `--train_images` and `--train_list` are both unset or set to None, ctc_reader.py automatically downloads the [sample data](http://cloud.dlnel.org/filepub/?uuid=df937251-3c0b-480d-9a7b-0080dfeee65c) and caches it under `$HOME/.cache/paddle/dataset/ctc_data/data/`.
+
+
+**B. Test set and evaluation set**
+
+The test set and the evaluation set are prepared in the same way as the training set.
+During training, the path of the test set is set through the train.py options `--test_images` and `--test_list`.
+During evaluation, the path of the evaluation set is set through the eval.py options `--input_images_dir` and `--input_images_list`.
+
+**C. Data to be predicted**
+
+Prediction supports three forms of input:
+
+First: set `--input_images_dir` and `--input_images_list`, similar to the training set, except that the last column of the list file may hold an arbitrary placeholder character or string, as shown below:
+
+```
+185 48 00508_0215.jpg s
+48 48 00197_1893.jpg s
+338 48 00007_0219.jpg s
+...
+```
+
+Second: set only `--input_images_list`; the list file then only holds the full paths of the images, as shown below:
+
+```
+data/test_images/00000.jpg
+data/test_images/00001.jpg
+data/test_images/00003.jpg
+```
+
+Third: read the path of one image from stdin and run inference on it once.
+
+### 1.2 Training
+
+Train on a single GPU with the default data:
+
+```
+env CUDA_VISIBLE_DEVICES=0 python ctc_train.py
+```
+
+Train on multiple GPUs with the default data:
+
+```
+env CUDA_VISIBLE_DEVICES=0,1,2,3 python ctc_train.py --parallel=True
+```
+
+Run `python ctc_train.py --help` for more usage information and detailed parameter descriptions.
+
+Figure 2 shows the convergence curve obtained with the default parameters and the default dataset; the horizontal axis is the number of training passes, and the vertical axis is the sequence_error on the test set.
+
+<p align="center">
+Figure 2
+</p>
+
+
+### 1.3 Evaluation
+
+Use the following command to run the evaluation script and evaluate a model on a given dataset:
+
+```
+env CUDA_VISIBLE_DEVICES=0 python eval.py \
+    --model_path="./models/model_0" \
+    --input_images_dir="./eval_data/images/" \
+    --input_images_list="./eval_data/eval_list"
+```
+
+Run `python eval.py --help` for detailed parameter descriptions.
+
+
+### 1.4 Prediction
+
+Read the path of one image from standard input and run prediction on it:
+
+```
+env CUDA_VISIBLE_DEVICES=0 python inference.py \
+    --model_path="models/model_00044_15000"
+```
+
+Running the command above produces output like the following:
+
+```
+----------- Configuration Arguments -----------
+use_gpu: True
+input_images_dir: None
+input_images_list: None
+model_path: /home/work/models/fluid/ocr_recognition/models/model_00052_15000
+------------------------------------------------
+Init model from: /home/work/models/fluid/ocr_recognition/models/model_00052_15000.
+Please input the path of image: /home/work/models/fluid/ocr_recognition/data/test_images/00001_0060.jpg
+result: [3298 2371 4233 6514 2378 3298 2363]
+Please input the path of image: /home/work/models/fluid/ocr_recognition/data/test_images/00001_0429.jpg
+result: [2067 2067 8187 8477 5027 7191 2431 1462]
+```
+
+Read image paths in batch from a file and run prediction on them:
+
+```
+env CUDA_VISIBLE_DEVICES=0 python inference.py \
+    --model_path="models/model_00044_15000" \
+    --input_images_list="data/test.list"
+```
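+
+### 1.5 Illustrative sketches
+
+To make the `train_list` format from Section 1.1.2 concrete, here is a tiny parsing sketch. It is an illustration only: the file path and the printed summary are assumptions, not part of this repository's tooling.
+
+```python
+# Minimal sketch: parse train_list entries (the path below is an assumption).
+with open('train_data/train_list') as list_file:
+    for line in list_file:
+        # Each line holds: width, height, image file name, label indices.
+        width, height, name, label = line.strip().split(' ')
+        indices = [int(i) for i in label.split(',')]
+        print("%s: %sx%s pixels, %d characters" % (name, width, height,
+                                                   len(indices)))
+```
+
+The CRNN-CTC pipeline described at the top of this chapter (convolution, `im2sequence`, bidirectional GRU, CTC loss) can likewise be summarized in a rough Fluid-style sketch. This is not the network defined in `crnn_ctc_model.py`; all layer sizes, the input shape and the exact layer stacking are assumptions made purely for illustration.
+
+```python
+import paddle.fluid as fluid
+
+def toy_crnn_ctc(num_classes, data_shape=[1, 48, 512]):
+    # data_shape and every hyper-parameter below are illustrative assumptions.
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(
+        name='label', shape=[1], dtype='int32', lod_level=1)
+    # Convolution stage: turn the image into a feature map.
+    conv = fluid.layers.conv2d(
+        input=images, num_filters=16, filter_size=3, padding=1, act='relu')
+    conv = fluid.layers.pool2d(input=conv, pool_size=2, pool_stride=2)
+    # Cut the feature map into a sequence of column slices.
+    seq = fluid.layers.im2sequence(
+        input=conv, filter_size=[conv.shape[2], 1], stride=[1, 1])
+    # Bidirectional GRU; dynamic_gru expects a 3 * hidden_size projection.
+    proj = fluid.layers.fc(input=seq, size=3 * 128)
+    gru_fwd = fluid.layers.dynamic_gru(input=proj, size=128)
+    gru_bwd = fluid.layers.dynamic_gru(input=proj, size=128, is_reverse=True)
+    # Per-step scores over all characters plus one extra class for the blank.
+    logits = fluid.layers.fc(input=[gru_fwd, gru_bwd], size=num_classes + 1)
+    # CTC loss; the blank symbol takes the last index.
+    cost = fluid.layers.warpctc(
+        input=logits, label=label, blank=num_classes, norm_by_times=True)
+    return fluid.layers.reduce_sum(cost)
+```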
diff --git a/fluid/ocr_recognition/crnn_ctc_model.py b/fluid/ocr_recognition/crnn_ctc_model.py
index 53763fa34dc05a6c972683ba396becec5294b887..df33100e36e25d871db25dd304e87053dfb77145 100644
--- a/fluid/ocr_recognition/crnn_ctc_model.py
+++ b/fluid/ocr_recognition/crnn_ctc_model.py
@@ -143,7 +143,7 @@ def ctc_train_net(images, label, args, num_classes):
     gradient_clip = None
     if args.parallel:
         places = fluid.layers.get_places()
-        pd = fluid.layers.ParallelDo(places)
+        pd = fluid.layers.ParallelDo(places, use_nccl=True)
         with pd.do():
             images_ = pd.read_input(images)
             label_ = pd.read_input(label)
diff --git a/fluid/ocr_recognition/ctc_reader.py b/fluid/ocr_recognition/ctc_reader.py
index c9f75a0d523a7390b3814706cdad831d5900dbdb..245177cb6f21849c4a5f42d65543732aa32cb6bd 100644
--- a/fluid/ocr_recognition/ctc_reader.py
+++ b/fluid/ocr_recognition/ctc_reader.py
@@ -30,10 +30,10 @@ class DataGenerator(object):
         Reader interface for training.

         :param img_root_dir: The root path of the image for training.
-        :type file_list: str
+        :type img_root_dir: str

         :param img_label_list: The path of the file for training.
-        :type file_list: str
+        :type img_label_list: str

         '''

@@ -91,10 +91,10 @@ class DataGenerator(object):
         Reader interface for inference.

         :param img_root_dir: The root path of the images for training.
-        :type file_list: str
+        :type img_root_dir: str

         :param img_label_list: The path of the file for testing.
-        :type file_list: list
+        :type img_label_list: str

         '''

         def reader():
@@ -111,6 +111,42 @@ class DataGenerator(object):

         return reader

+    def infer_reader(self, img_root_dir=None, img_label_list=None):
+        '''A reader interface for inference.
+
+        :param img_root_dir: The root path of the images for inference.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the list file. If img_root_dir is
+        None, each line should be a full image path. If img_label_list is
+        None, image paths are read from stdin.
+        :type img_label_list: str
+        '''
+
+        def reader():
+            if img_label_list is not None:
+                for line in open(img_label_list):
+                    if img_root_dir is not None:
+                        # h, w, img_name, labels
+                        img_name = line.split(' ')[2]
+                        img_path = os.path.join(img_root_dir, img_name)
+                    else:
+                        img_path = line.strip("\t\n\r")
+                    img = Image.open(img_path).convert('L')
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    # Inference needs no label; yield a dummy one.
+                    yield img, [[0]]
+            else:
+                while True:
+                    img_path = raw_input("Please input the path of image: ")
+                    img = Image.open(img_path).convert('L')
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    yield img, [[0]]
+
+        return reader
+

 def num_classes():
     '''Get classes number of this dataset.
@@ -124,21 +160,31 @@ def data_shape():
     return DATA_SHAPE


-def train(batch_size):
+def train(batch_size, train_images_dir=None, train_list_file=None):
     generator = DataGenerator()
-    data_dir = download_data()
-    return generator.train_reader(
-        path.join(data_dir, TRAIN_DATA_DIR_NAME),
-        path.join(data_dir, TRAIN_LIST_FILE_NAME), batch_size)
+    if train_images_dir is None:
+        data_dir = download_data()
+        train_images_dir = path.join(data_dir, TRAIN_DATA_DIR_NAME)
+    if train_list_file is None:
+        train_list_file = path.join(data_dir, TRAIN_LIST_FILE_NAME)
+    return generator.train_reader(train_images_dir, train_list_file, batch_size)
+
+
+def test(batch_size=1, test_images_dir=None, test_list_file=None):
+    generator = DataGenerator()
+    if test_images_dir is None:
+        data_dir = download_data()
+        test_images_dir = path.join(data_dir, TEST_DATA_DIR_NAME)
+    if test_list_file is None:
+        test_list_file = path.join(data_dir, TEST_LIST_FILE_NAME)
+    return paddle.batch(
+        generator.test_reader(test_images_dir, test_list_file), batch_size)


-def test(batch_size=1):
+def inference(infer_images_dir=None, infer_list_file=None):
     generator = DataGenerator()
-    data_dir = download_data()
     return paddle.batch(
-        generator.test_reader(
-            path.join(data_dir, TRAIN_DATA_DIR_NAME),
-            path.join(data_dir, TRAIN_LIST_FILE_NAME)), batch_size)
+        generator.infer_reader(infer_images_dir, infer_list_file), 1)


 def download_data():
diff --git a/fluid/ocr_recognition/ctc_train.py b/fluid/ocr_recognition/ctc_train.py
index 2ac23f609779c5e653919fece6dbf661c79e859f..35db803506179d162226ae553fa25bfd4323d567 100644
--- a/fluid/ocr_recognition/ctc_train.py
+++ b/fluid/ocr_recognition/ctc_train.py
@@ -1,61 +1,82 @@
 """Trainer for OCR CTC model."""
 import paddle.fluid as fluid
-import dummy_reader
+from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
+from crnn_ctc_model import ctc_train_net
 import ctc_reader
 import argparse
-from load_model import load_param
 import functools
 import sys
-from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
-from crnn_ctc_model import ctc_train_net
 import time
+import os

 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('batch_size',      int,   32,     "Minibatch size.")
-add_arg('pass_num',        int,   100,    "# of training epochs.")
-add_arg('log_period',      int,   1000,   "Log period.")
-add_arg('learning_rate',   float, 1.0e-3, "Learning rate.")
-add_arg('l2',              float, 0.0004, "L2 regularizer.")
-add_arg('max_clip',        float, 10.0,   "Max clip threshold.")
-add_arg('min_clip',        float, -10.0,  "Min clip threshold.")
-add_arg('momentum',        float, 0.9,    "Momentum.")
-add_arg('rnn_hidden_size', int,   200,    "Hidden size of rnn layers.")
-add_arg('device',          int,   0,      "Device id.'-1' means running on CPU"
-        "while '0' means GPU-0.")
GPU-0.") -add_arg('min_average_window', int, 10000, "Min average window.") -add_arg('max_average_window', int, 15625, "Max average window.") -add_arg('average_window', float, 0.15, "Average window.") -add_arg('parallel', bool, False, "Whether use parallel training.") -# yapf: disable - -def load_parameter(place): - params = load_param('./name.map', './data/model/results_without_avg_window/pass-00000/') - for name in params: - t = fluid.global_scope().find_var(name).get_tensor() - t.set(params[name], place) +add_arg('batch_size', int, 32, "Minibatch size.") +add_arg('pass_num', int, 100, "Number of training epochs.") +add_arg('log_period', int, 1000, "Log period.") +add_arg('save_model_period', int, 15000, "Save model period. '-1' means never saving the model.") +add_arg('eval_period', int, 15000, "Evaluate period. '-1' means never evaluating the model.") +add_arg('save_model_dir', str, "./models", "The directory the model to be saved to.") +add_arg('init_model', str, None, "The init model file of directory.") +add_arg('learning_rate', float, 1.0e-3, "Learning rate.") +add_arg('l2', float, 0.0004, "L2 regularizer.") +add_arg('momentum', float, 0.9, "Momentum.") +add_arg('rnn_hidden_size', int, 200, "Hidden size of rnn layers.") +add_arg('use_gpu', bool, True, "Whether use GPU to train.") +add_arg('min_average_window',int, 10000, "Min average window.") +add_arg('max_average_window',int, 15625, "Max average window. It is proposed to be set as the number of minibatch in a pass.") +add_arg('average_window', float, 0.15, "Average window.") +add_arg('parallel', bool, False, "Whether use parallel training.") +add_arg('train_images', str, None, "The directory of training images." + "None means using the default training images of reader.") +add_arg('train_list', str, None, "The list file of training images." + "None means using the default train_list file of reader.") +add_arg('test_images', str, None, "The directory of training images." + "None means using the default test images of reader.") +add_arg('test_list', str, None, "The list file of training images." + "None means using the default test_list file of reader.") +add_arg('num_classes', int, None, "The number of classes." 
+        " None means using the default num_classes from the reader.")
+# yapf: enable


-def train(args, data_reader=dummy_reader):
+def train(args, data_reader=ctc_reader):
     """OCR CTC training"""
-    num_classes = data_reader.num_classes()
+    num_classes = data_reader.num_classes(
+    ) if args.num_classes is None else args.num_classes
     data_shape = data_reader.data_shape()
     # define network
     images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int32', lod_level=1)
-    sum_cost, error_evaluator, inference_program, model_average = ctc_train_net(images, label, args, num_classes)
+    label = fluid.layers.data(
+        name='label', shape=[1], dtype='int32', lod_level=1)
+    sum_cost, error_evaluator, inference_program, model_average = ctc_train_net(
+        images, label, args, num_classes)
     # data reader
-    train_reader = data_reader.train(args.batch_size)
-    test_reader = data_reader.test()
+    train_reader = data_reader.train(
+        args.batch_size,
+        train_images_dir=args.train_images,
+        train_list_file=args.train_list)
+    test_reader = data_reader.test(
+        test_images_dir=args.test_images, test_list_file=args.test_list)
+
     # prepare environment
     place = fluid.CPUPlace()
-    if args.device >= 0:
-        place = fluid.CUDAPlace(args.device)
+    if args.use_gpu:
+        place = fluid.CUDAPlace(0)
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
-    #load_parameter(place)
+
+    # load init model
+    if args.init_model is not None:
+        model_dir = args.init_model
+        model_file_name = None
+        if not os.path.isdir(args.init_model):
+            model_dir = os.path.dirname(args.init_model)
+            model_file_name = os.path.basename(args.init_model)
+        fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
+        print "Init model from: %s." % args.init_model

     for pass_id in range(args.pass_num):
         error_evaluator.reset(exe)
@@ -70,29 +91,41 @@ def train(args, data_reader=dummy_reader):
                           fetch_list=[sum_cost] + error_evaluator.metrics)
             total_loss += batch_loss[0]
             total_seq_error += batch_seq_error[0]
-            if batch_id % 100 == 1:
-                print '.',
-                sys.stdout.flush()
-            if batch_id % args.log_period == 1:
+            # training log (skip batch 0 to avoid a division by zero)
+            if batch_id % args.log_period == 0 and batch_id > 0:
                 print "\nTime: %s; Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq error: %s." % (
-                    time.time(),
-                    pass_id, batch_id, total_loss / (batch_id * args.batch_size), total_seq_error / (batch_id * args.batch_size))
+                    time.time(), pass_id, batch_id,
+                    total_loss / (batch_id * args.batch_size),
+                    total_seq_error / (batch_id * args.batch_size))
                 sys.stdout.flush()
-            batch_id += 1

-        with model_average.apply(exe):
-            error_evaluator.reset(exe)
-            for data in test_reader():
-                exe.run(inference_program, feed=get_feeder_data(data, place))
-            _, test_seq_error = error_evaluator.eval(exe)
+            # evaluate
+            if batch_id % args.eval_period == 0:
+                with model_average.apply(exe):
+                    error_evaluator.reset(exe)
+                    for data in test_reader():
+                        exe.run(inference_program,
+                                feed=get_feeder_data(data, place))
+                    _, test_seq_error = error_evaluator.eval(exe)

+                print "\nTime: %s; Pass[%d]-batch[%d]; Test seq error: %s.\n" % (
+                    time.time(), pass_id, batch_id, str(test_seq_error[0]))

+            # save model
+            if batch_id % args.save_model_period == 0:
+                with model_average.apply(exe):
+                    filename = "model_%05d_%d" % (pass_id, batch_id)
+                    fluid.io.save_params(
+                        exe, dirname=args.save_model_dir, filename=filename)
+                    print "Saved model to: %s/%s." % (args.save_model_dir,
+                                                      filename)
+
+            batch_id += 1

-        print "\nEnd pass[%d]; Test seq error: %s.\n" % (
-            pass_id, str(test_seq_error[0]))

 def main():
     args = parser.parse_args()
     print_arguments(args)
     train(args, data_reader=ctc_reader)

+
 if __name__ == "__main__":
     main()
diff --git a/fluid/ocr_recognition/dummy_reader.py b/fluid/ocr_recognition/dummy_reader.py
deleted file mode 100644
index def91b1dd95857e7df740271cac486001da5f24b..0000000000000000000000000000000000000000
--- a/fluid/ocr_recognition/dummy_reader.py
+++ /dev/null
@@ -1,52 +0,0 @@
-"""A dummy reader for test."""
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-import numpy as np
-import paddle.v2 as paddle
-
-DATA_SHAPE = [1, 512, 512]
-NUM_CLASSES = 20
-
-
-def _read_creater(num_sample=1024, min_seq_len=1, max_seq_len=10):
-    def reader():
-        for i in range(num_sample):
-            sequence_len = np.random.randint(min_seq_len, max_seq_len)
-            x = np.random.uniform(0.1, 1, DATA_SHAPE).astype("float32")
-            y = np.random.randint(0, NUM_CLASSES + 1,
-                                  [sequence_len]).astype("int32")
-            yield x, y
-
-    return reader
-
-
-def train(batch_size, num_sample=128):
-    """Get train dataset reader."""
-    return paddle.batch(_read_creater(num_sample=num_sample), batch_size)
-
-
-def test(batch_size=1, num_sample=16):
-    """Get test dataset reader."""
-    return paddle.batch(_read_creater(num_sample=num_sample), batch_size)
-
-
-def data_shape():
-    """Get image shape in CHW order."""
-    return DATA_SHAPE
-
-
-def num_classes():
-    """Get number of total classes."""
-    return NUM_CLASSES
diff --git a/fluid/ocr_recognition/eval.py b/fluid/ocr_recognition/eval.py
index 342d0f16cd5f321d56988273cd6f47759e31bef0..be0a04380b62b274abfa954cbeed451afb441922 100644
--- a/fluid/ocr_recognition/eval.py
+++ b/fluid/ocr_recognition/eval.py
@@ -1,21 +1,24 @@
 import paddle.v2 as paddle
 import paddle.fluid as fluid
-from load_model import load_param
-from utility import get_feeder_data
+from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
+from crnn_ctc_model import ctc_infer
 from crnn_ctc_model import ctc_eval
 import ctc_reader
-import dummy_reader
+import argparse
+import functools
+import os

+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('model_path',        str,   None,  "The model path to be used for evaluation.")
+add_arg('input_images_dir',  str,   None,  "The directory of images.")
+add_arg('input_images_list', str,   None,  "The list file of images.")
+add_arg('use_gpu',           bool,  True,  "Whether to use GPU for evaluation.")
+# yapf: enable

-def load_parameter(place):
-    params = load_param('./name.map', './data/model/results/pass-00062/')
-    for name in params:
-        print "param: %s" % name
-        t = fluid.global_scope().find_var(name).get_tensor()
-        t.set(params[name], place)
-
-def evaluate(eval=ctc_eval, data_reader=dummy_reader):
+def evaluate(args, eval=ctc_eval, data_reader=ctc_reader):
     """OCR inference"""
     num_classes = data_reader.num_classes()
     data_shape = data_reader.data_shape()
@@ -26,29 +29,41 @@ def evaluate(eval=ctc_eval, data_reader=dummy_reader):
     evaluator, cost = eval(images, label, num_classes)

     # data reader
-    test_reader = data_reader.test()
+    test_reader = data_reader.test(
+        test_images_dir=args.input_images_dir,
+        test_list_file=args.input_images_list)
+
     # prepare environment
-    place = fluid.CUDAPlace(0)
-    #place = fluid.CPUPlace()
+    place = fluid.CPUPlace()
+    if args.use_gpu:
+        place = fluid.CUDAPlace(0)
+
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
-    print fluid.default_main_program()
-    load_parameter(place)
+
+    # load init model
+    model_dir = args.model_path
+    model_file_name = None
+    if not os.path.isdir(args.model_path):
+        model_dir = os.path.dirname(args.model_path)
+        model_file_name = os.path.basename(args.model_path)
+    fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
+    print "Init model from: %s." % args.model_path
+
     evaluator.reset(exe)
     count = 0
     for data in test_reader():
         count += 1
-        print 'Process samples: %d\r' % (count, ),
-        result, avg_distance, avg_seq_error = exe.run(
-            fluid.default_main_program(),
-            feed=get_feeder_data(data, place),
-            fetch_list=[cost] + evaluator.metrics)
+        exe.run(fluid.default_main_program(), feed=get_feeder_data(data, place))
     avg_distance, avg_seq_error = evaluator.eval(exe)
-    print "avg_distance: %s; avg_seq_error: %s" % (avg_distance, avg_seq_error)
+    print "Read %d samples; avg_distance: %s; avg_seq_error: %s" % (
+        count, avg_distance, avg_seq_error)


 def main():
-    evaluate(data_reader=ctc_reader)
+    args = parser.parse_args()
+    print_arguments(args)
+    evaluate(args, data_reader=ctc_reader)


 if __name__ == "__main__":
diff --git a/fluid/ocr_recognition/images/demo.jpg b/fluid/ocr_recognition/images/demo.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..be5aee506f68861583903d04c526523afc299ab8
Binary files /dev/null and b/fluid/ocr_recognition/images/demo.jpg differ
diff --git a/fluid/ocr_recognition/images/train.jpg b/fluid/ocr_recognition/images/train.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3d691f1cd6b44c99c1b89286573daf1abd6dcbfa
Binary files /dev/null and b/fluid/ocr_recognition/images/train.jpg differ
diff --git a/fluid/ocr_recognition/inference.py b/fluid/ocr_recognition/inference.py
index 32bc59e9b04dd91e2060b55adbb6264e7797fbe5..04175bb15d7834b76818b330763054e0a519e508 100644
--- a/fluid/ocr_recognition/inference.py
+++ b/fluid/ocr_recognition/inference.py
@@ -1,47 +1,64 @@
 import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-from load_model import load_param
-from utility import get_feeder_data
+import paddle.fluid as fluid
+from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
 from crnn_ctc_model import ctc_infer
+import numpy as np
 import ctc_reader
-import dummy_reader
+import argparse
+import functools
+import os

+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('model_path',        str,   None,  "The model path to be used for inference.")
+add_arg('input_images_dir',  str,   None,  "The directory of images.")
+add_arg('input_images_list', str,   None,  "The list file of images.")
+add_arg('use_gpu',           bool,  True,  "Whether to use GPU for inference.")
+# yapf: enable

-def load_parameter(place):
-    params = load_param('./name.map', './data/model/results/pass-00062/')
-    for name in params:
-        print "param: %s" % name
-        t = fluid.global_scope().find_var(name).get_tensor()
-        t.set(params[name], place)
-
-def inference(infer=ctc_infer, data_reader=dummy_reader):
+def inference(args, infer=ctc_infer, data_reader=ctc_reader):
     """OCR inference"""
     num_classes = data_reader.num_classes()
     data_shape = data_reader.data_shape()
     # define network
     images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    sequence, tmp = infer(images, num_classes)
-    fluid.layers.Print(tmp)
+    sequence = infer(images, num_classes)
     # data reader
-    test_reader = data_reader.test()
+    infer_reader = data_reader.inference(
+        infer_images_dir=args.input_images_dir,
+        infer_list_file=args.input_images_list)
     # prepare environment
-    place = fluid.CUDAPlace(0)
+    place = fluid.CPUPlace()
+    if args.use_gpu:
+        place = fluid.CUDAPlace(0)
+
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
-    load_parameter(place)

+    # load init model
+    model_dir = args.model_path
+    model_file_name = None
+    if not os.path.isdir(args.model_path):
+        model_dir = os.path.dirname(args.model_path)
+        model_file_name = os.path.basename(args.model_path)
+    fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
+    print "Init model from: %s." % args.model_path

-    for data in test_reader():
+    for data in infer_reader():
         result = exe.run(fluid.default_main_program(),
                          feed=get_feeder_data(
                              data, place, need_label=False),
-                         fetch_list=[tmp])
-        print "result: %s" % (list(result[0].flatten()), )
+                         fetch_list=[sequence],
+                         return_numpy=False)
+        print "result: %s" % (np.array(result[0]).flatten(), )


 def main():
-    inference(data_reader=ctc_reader)
+    args = parser.parse_args()
+    print_arguments(args)
+    inference(args, data_reader=ctc_reader)


 if __name__ == "__main__":
diff --git a/fluid/ocr_recognition/load_model.py b/fluid/ocr_recognition/load_model.py
deleted file mode 100644
index fea9398866f3f3c276f6e998a18c6bdd0a2a488a..0000000000000000000000000000000000000000
--- a/fluid/ocr_recognition/load_model.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import sys
-import numpy as np
-import ast
-
-
-def load_parameter(file_name):
-    with open(file_name, 'rb') as f:
-        f.read(16)  # skip header.
-        return np.fromfile(f, dtype=np.float32)
-
-
-def load_param(name_map_file, old_param_dir):
-    result = {}
-    name_map = {}
-    shape_map = {}
-    with open(name_map_file, 'r') as map_file:
-        for param in map_file:
-            old_name, new_name, shape = param.strip().split('=')
-            name_map[new_name] = old_name
-            shape_map[new_name] = ast.literal_eval(shape)
-
-    for new_name in name_map:
-        result[new_name] = load_parameter("/".join(
-            [old_param_dir, name_map[new_name]])).reshape(shape_map[new_name])
-    return result
-
-
-if __name__ == "__main__":
-    name_map_file = "./name.map"
-    old_param_dir = "./data/model/results/pass-00062/"
-    result = load_param(name_map_file, old_param_dir)
-    for p in result:
-        print "name: %s; param.shape: %s" % (p, result[p].shape)
diff --git a/fluid/policy_gradient/brain.py b/fluid/policy_gradient/brain.py
index 8387833065d89e0a61b90734771a8d9db5ac1eb4..ad556902f1f2d9b40e9ce8905373541decffa642 100644
--- a/fluid/policy_gradient/brain.py
+++ b/fluid/policy_gradient/brain.py
@@ -30,32 +30,28 @@ class PolicyGradient:
         acts = fluid.layers.data(name='acts', shape=[1], dtype='int64')
         vt = fluid.layers.data(name='vt', shape=[1], dtype='float32')
         # fc1
-        fc1 = fluid.layers.fc(
-            input=obs,
-            size=10,
-            act="tanh"  # tanh activation
-        )
+        fc1 = fluid.layers.fc(input=obs, size=10, act="tanh")  # tanh activation
         # fc2
-        self.all_act_prob = fluid.layers.fc(input=fc1,
-                                            size=self.n_actions,
-                                            act="softmax")
+        self.all_act_prob = fluid.layers.fc(input=fc1,
+                                            size=self.n_actions,
+                                            act="softmax")
+        # clone the program before the loss ops are added, for inference
+        self.inference_program = fluid.default_main_program().clone()
         # to maximize total reward (log_p * R) is to minimize -(log_p * R)
         neg_log_prob = fluid.layers.cross_entropy(
             input=self.all_act_prob,
             label=acts)  # this is negative log of chosen action
         neg_log_prob_weight = fluid.layers.elementwise_mul(x=neg_log_prob, y=vt)
         loss = fluid.layers.reduce_mean(
-            x=neg_log_prob_weight)  # reward guided loss
+            neg_log_prob_weight)  # reward guided loss

         sgd_optimizer = fluid.optimizer.SGD(self.lr)
         sgd_optimizer.minimize(loss)
         self.exe.run(fluid.default_startup_program())

     def choose_action(self, observation):
-        prob_weights = self.exe.run(
-            fluid.default_main_program().prune(self.all_act_prob),
-            feed={"obs": observation[np.newaxis, :]},
-            fetch_list=[self.all_act_prob])
+        prob_weights = self.exe.run(self.inference_program,
+                                    feed={"obs": observation[np.newaxis, :]},
+                                    fetch_list=[self.all_act_prob])
         prob_weights = np.array(prob_weights[0])
         action = np.random.choice(
             range(prob_weights.shape[1]),
diff --git a/fluid/text_classification/README.md b/fluid/text_classification/README.md
index 500ee6ae6db28e9d844d206a1cc894c36f1db09f..43c15934fa62af3db2261be37803ce21ba6bf946 100644
--- a/fluid/text_classification/README.md
+++ b/fluid/text_classification/README.md
@@ -1,16 +1,112 @@
-The minimum PaddlePaddle version needed for the code sample in this directory is the lastest develop branch. If you are on a version of PaddlePaddle earlier than this, [please update your installation](http://www.paddlepaddle.org/docs/develop/documentation/en/build_and_install/pip_install_en.html).
+# Text Classification

----
+Below is a brief overview of this example's directory structure:

-# Text Classification
-
-## Data Preparation
-```
-wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
-tar zxf aclImdb_v1.tar.gz
+```text
+.
diff --git a/fluid/text_classification/README.md b/fluid/text_classification/README.md
index 500ee6ae6db28e9d844d206a1cc894c36f1db09f..43c15934fa62af3db2261be37803ce21ba6bf946 100644
--- a/fluid/text_classification/README.md
+++ b/fluid/text_classification/README.md
@@ -1,16 +1,112 @@
-The minimum PaddlePaddle version needed for the code sample in this directory is the lastest develop branch. If you are on a version of PaddlePaddle earlier than this, [please update your installation](http://www.paddlepaddle.org/docs/develop/documentation/en/build_and_install/pip_install_en.html).
+# Text Classification
 
----
+Below is a brief overview of this example's directory structure:
 
-# Text Classification
-
-## Data Preparation
-```
-wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
-tar zxf aclImdb_v1.tar.gz
+```text
+.
+├── nets.py       # model definitions
+├── README.md     # this documentation
+├── train.py      # training script
+├── infer.py      # inference script
+└── utils.py      # common helper functions shared by the scripts
 ```
+
+## Introduction and Models
+
+The PaddlePaddle v2 [text classification](https://github.com/PaddlePaddle/models/blob/develop/text/README.md) example already introduces the text classification task in detail, so the introduction is not repeated here.
+For models, we adopt four common text classification networks: bow, cnn, lstm, and gru.
+
+## Training
+
+1. Run the command `python train.py bow` to start training.
+   ```bash
+   python train.py bow    # "bow" selects the network structure; replace with cnn, lstm, or gru
+   ```
+
+2. (Optional) To use a custom network structure, add it to [nets.py](./nets.py) and set the corresponding arguments in [train.py](./train.py); a sketch of such a network follows the sample output below.
+   ```python
+   def train(train_reader,      # training data reader
+             word_dict,         # word dictionary
+             network,           # network configuration
+             use_cuda,          # whether to use GPU
+             parallel,          # whether to run in parallel
+             save_dirname,      # path for saving models
+             lr=0.2,            # learning rate
+             batch_size=128,    # samples per batch
+             pass_num=30):      # number of training passes
+   ```
+
+## Sample Training Output
+```text
+ pass_id: 0, avg_acc: 0.848040, avg_cost: 0.354073
+ pass_id: 1, avg_acc: 0.914200, avg_cost: 0.217945
+ pass_id: 2, avg_acc: 0.929800, avg_cost: 0.184302
+ pass_id: 3, avg_acc: 0.938680, avg_cost: 0.164240
+ pass_id: 4, avg_acc: 0.945120, avg_cost: 0.149150
+ pass_id: 5, avg_acc: 0.951280, avg_cost: 0.137117
+ pass_id: 6, avg_acc: 0.955360, avg_cost: 0.126434
+ pass_id: 7, avg_acc: 0.961400, avg_cost: 0.117405
+ pass_id: 8, avg_acc: 0.963560, avg_cost: 0.110070
+ pass_id: 9, avg_acc: 0.965840, avg_cost: 0.103273
+ pass_id: 10, avg_acc: 0.969800, avg_cost: 0.096314
+ pass_id: 11, avg_acc: 0.971720, avg_cost: 0.090206
+ pass_id: 12, avg_acc: 0.974800, avg_cost: 0.084970
+ pass_id: 13, avg_acc: 0.977400, avg_cost: 0.078981
+ pass_id: 14, avg_acc: 0.980000, avg_cost: 0.073685
+ pass_id: 15, avg_acc: 0.981080, avg_cost: 0.069898
+ pass_id: 16, avg_acc: 0.982080, avg_cost: 0.064923
+ pass_id: 17, avg_acc: 0.984680, avg_cost: 0.060861
+ pass_id: 18, avg_acc: 0.985840, avg_cost: 0.057095
+ pass_id: 19, avg_acc: 0.988080, avg_cost: 0.052424
+ pass_id: 20, avg_acc: 0.989160, avg_cost: 0.049059
+ pass_id: 21, avg_acc: 0.990120, avg_cost: 0.045882
+ pass_id: 22, avg_acc: 0.992080, avg_cost: 0.042140
+ pass_id: 23, avg_acc: 0.992280, avg_cost: 0.039722
+ pass_id: 24, avg_acc: 0.992840, avg_cost: 0.036607
+ pass_id: 25, avg_acc: 0.994440, avg_cost: 0.034040
+ pass_id: 26, avg_acc: 0.995000, avg_cost: 0.031501
+ pass_id: 27, avg_acc: 0.995440, avg_cost: 0.028988
+ pass_id: 28, avg_acc: 0.996240, avg_cost: 0.026639
+ pass_id: 29, avg_acc: 0.996960, avg_cost: 0.024186
 ```
-python train.py --dict_path 'aclImdb/imdb.vocab'
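As an illustration of step 2 above, a custom network added to nets.py only needs to mirror the contract of the provided ones: take the data and label layers plus the dictionary size, and return `(avg_cost, acc, prediction)`. The network below is a hypothetical sketch, not part of this change:

```python
import paddle.fluid as fluid


def my_fc_net(data, label, dict_dim, emb_dim=128, hid_dim=128, class_dim=2):
    """Hypothetical custom network: embedding -> average pool -> fc."""
    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
    pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
    fc = fluid.layers.fc(input=pool, size=hid_dim, act='tanh')
    prediction = fluid.layers.fc(input=fc, size=class_dim, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction
```

Passing `my_fc_net` as the `network` argument of `train()` would then train it with no other changes.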
+
+## Inference
+1. Run the command `python infer.py bow_model` to start inference.
+   ```bash
+   python infer.py bow_model    # "bow_model" is the directory of saved models to load
+   ```
+
+## Sample Inference Output
+```text
+ model_path: bow_model/epoch0, avg_acc: 0.882800
+ model_path: bow_model/epoch1, avg_acc: 0.882360
+ model_path: bow_model/epoch2, avg_acc: 0.881400
+ model_path: bow_model/epoch3, avg_acc: 0.877800
+ model_path: bow_model/epoch4, avg_acc: 0.872920
+ model_path: bow_model/epoch5, avg_acc: 0.872640
+ model_path: bow_model/epoch6, avg_acc: 0.869960
+ model_path: bow_model/epoch7, avg_acc: 0.865160
+ model_path: bow_model/epoch8, avg_acc: 0.863680
+ model_path: bow_model/epoch9, avg_acc: 0.861200
+ model_path: bow_model/epoch10, avg_acc: 0.853520
+ model_path: bow_model/epoch11, avg_acc: 0.850400
+ model_path: bow_model/epoch12, avg_acc: 0.855960
+ model_path: bow_model/epoch13, avg_acc: 0.853480
+ model_path: bow_model/epoch14, avg_acc: 0.855960
+ model_path: bow_model/epoch15, avg_acc: 0.854120
+ model_path: bow_model/epoch16, avg_acc: 0.854160
+ model_path: bow_model/epoch17, avg_acc: 0.852240
+ model_path: bow_model/epoch18, avg_acc: 0.852320
+ model_path: bow_model/epoch19, avg_acc: 0.850280
+ model_path: bow_model/epoch20, avg_acc: 0.849760
+ model_path: bow_model/epoch21, avg_acc: 0.850160
+ model_path: bow_model/epoch22, avg_acc: 0.846800
+ model_path: bow_model/epoch23, avg_acc: 0.845440
+ model_path: bow_model/epoch24, avg_acc: 0.845640
+ model_path: bow_model/epoch25, avg_acc: 0.846200
+ model_path: bow_model/epoch26, avg_acc: 0.845880
+ model_path: bow_model/epoch27, avg_acc: 0.844880
+ model_path: bow_model/epoch28, avg_acc: 0.844680
+ model_path: bow_model/epoch29, avg_acc: 0.844960
 ```
+Note: avg_acc keeps decreasing in later epochs because of overfitting; this is expected and can be ignored.
diff --git a/fluid/text_classification/config.py b/fluid/text_classification/config.py
deleted file mode 100644
index 2aba3247eb9033d959bbf4a7c3d475d5c8309058..0000000000000000000000000000000000000000
--- a/fluid/text_classification/config.py
+++ /dev/null
@@ -1,16 +0,0 @@
-class TrainConfig(object):
-
-    # Whether to use GPU in training or not.
-    use_gpu = False
-
-    # The training batch size.
-    batch_size = 4
-
-    # The epoch number.
-    num_passes = 30
-
-    # The global learning rate.
-    learning_rate = 0.01
-
-    # Training log will be printed every log_period.
- log_period = 100 diff --git a/fluid/text_classification/infer.py b/fluid/text_classification/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..d2a0363d786866a92195dba8b490287b3ca9bc9d --- /dev/null +++ b/fluid/text_classification/infer.py @@ -0,0 +1,50 @@ +import sys +import time +import unittest +import contextlib +import numpy as np + +import paddle.fluid as fluid +import paddle.v2 as paddle + +import utils + + +def infer(test_reader, use_cuda, model_path=None): + """ + inference function + """ + if model_path is None: + print(str(model_path) + " cannot be found") + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(model_path, exe) + + total_acc = 0.0 + total_count = 0 + for data in test_reader(): + acc = exe.run(inference_program, + feed=utils.data2tensor(data, place), + fetch_list=fetch_targets, + return_numpy=True) + total_acc += acc[0] * len(data) + total_count += len(data) + + avg_acc = total_acc / total_count + print("model_path: %s, avg_acc: %f" % (model_path, avg_acc)) + + +if __name__ == "__main__": + word_dict, train_reader, test_reader = utils.prepare_data( + "imdb", self_dict=False, batch_size=128, buf_size=50000) + + model_path = sys.argv[1] + for i in range(30): + epoch_path = model_path + "/" + "epoch" + str(i) + infer(test_reader, use_cuda=False, model_path=epoch_path) diff --git a/fluid/text_classification/nets.py b/fluid/text_classification/nets.py new file mode 100644 index 0000000000000000000000000000000000000000..a21742d22d0bd1676c8c5874899af746b5225636 --- /dev/null +++ b/fluid/text_classification/nets.py @@ -0,0 +1,124 @@ +import sys +import time +import numpy as np + +import paddle.fluid as fluid +import paddle.v2 as paddle + + +def bow_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + bow net + """ + emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def cnn_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + win_size=3): + """ + conv net + """ + emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) + + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=win_size, + act="tanh", + pool_type="max") + + fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2) + + prediction = fluid.layers.fc(input=[fc_1], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def lstm_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + """ + lstm net + """ + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + 
param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4, act='tanh') + + lstm_h, c = fluid.layers.dynamic_lstm( + input=fc0, size=hid_dim * 4, is_reverse=False) + + lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') + lstm_max_tanh = fluid.layers.tanh(lstm_max) + + fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') + + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def gru_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=400.0): + """ + gru net + """ + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3) + gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) + gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') + gru_max_tanh = fluid.layers.tanh(gru_max) + fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction diff --git a/fluid/text_classification/train.py b/fluid/text_classification/train.py index d32e1c4c878f4d6ef554cc27e0fc5ffc99f96a4a..dc164671e785b758365885b98788fae71d5f8a87 100644 --- a/fluid/text_classification/train.py +++ b/fluid/text_classification/train.py @@ -1,164 +1,131 @@ -import numpy as np import sys -import os -import argparse import time +import unittest +import contextlib -import paddle.v2 as paddle import paddle.fluid as fluid +import paddle.v2 as paddle -from config import TrainConfig as conf - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - '--dict_path', - type=str, - required=True, - help="Path of the word dictionary.") - return parser.parse_args() - - -# Define to_lodtensor function to process the sequential data. -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -# Load the dictionary. -def load_vocab(filename): - vocab = {} - with open(filename) as f: - for idx, line in enumerate(f): - vocab[line.strip()] = idx - return vocab - - -# Define the convolution model. 
-def conv_net(dict_dim, - window_size=3, - emb_dim=128, - num_filters=128, - fc0_dim=96, - class_dim=2): - +import utils +from nets import bow_net +from nets import cnn_net +from nets import lstm_net +from nets import gru_net + + +def train(train_reader, + word_dict, + network, + use_cuda, + parallel, + save_dirname, + lr=0.2, + batch_size=128, + pass_num=30): + """ + train network + """ data = fluid.layers.data( name="words", shape=[1], dtype="int64", lod_level=1) label = fluid.layers.data(name="label", shape=[1], dtype="int64") - emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) - - conv_3 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=num_filters, - filter_size=window_size, - act="tanh", - pool_type="max") - - fc_0 = fluid.layers.fc(input=[conv_3], size=fc0_dim) - - prediction = fluid.layers.fc(input=[fc_0], size=class_dim, act="softmax") - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - - avg_cost = fluid.layers.mean(x=cost) - - return data, label, prediction, avg_cost - - -def main(dict_path): - word_dict = load_vocab(dict_path) - word_dict[""] = len(word_dict) - dict_dim = len(word_dict) - print("The dictionary size is : %d" % dict_dim) - - data, label, prediction, avg_cost = conv_net(dict_dim) - - sgd_optimizer = fluid.optimizer.SGD(learning_rate=conf.learning_rate) - sgd_optimizer.minimize(avg_cost) - - batch_size_var = fluid.layers.create_tensor(dtype='int64') - batch_acc_var = fluid.layers.accuracy( - input=prediction, label=label, total=batch_size_var) - - inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program( - target_vars=[batch_acc_var, batch_size_var]) + if not parallel: + cost, acc, prediction = network(data, label, len(word_dict)) + else: + places = fluid.layers.get_places(device_count=2) + pd = fluid.layers.ParallelDo(places) + with pd.do(): + cost, acc, prediction = network( + pd.read_input(data), pd.read_input(label), len(word_dict)) - # The training data set. - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=51200), - batch_size=conf.batch_size) + pd.write_output(cost) + pd.write_output(acc) - # The testing data set. 
-    test_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.test(word_dict), buf_size=51200),
-        batch_size=conf.batch_size)
+        cost, acc = pd()
+        cost = fluid.layers.mean(cost)
+        acc = fluid.layers.mean(acc)
 
-    if conf.use_gpu:
-        place = fluid.CUDAPlace(0)
-    else:
-        place = fluid.CPUPlace()
+    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
+    sgd_optimizer.minimize(cost)
 
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
     feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
     exe.run(fluid.default_startup_program())
 
+    for pass_id in xrange(pass_num):
+        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
+        for data in train_reader():
+            avg_cost_np, avg_acc_np = exe.run(fluid.default_main_program(),
+                                              feed=feeder.feed(data),
+                                              fetch_list=[cost, acc])
+            data_size = len(data)
+            total_acc += data_size * avg_acc_np
+            total_cost += data_size * avg_cost_np
+            data_count += data_size
+        avg_cost = total_cost / data_count
+
+        avg_acc = total_acc / data_count
+        print("pass_id: %d, avg_acc: %f, avg_cost: %f" %
+              (pass_id, avg_acc, avg_cost))
+
+        epoch_model = save_dirname + "/" + "epoch" + str(pass_id)
+        fluid.io.save_inference_model(epoch_model, ["words", "label"], acc, exe)
+
+
+def train_net():
+    word_dict, train_reader, test_reader = utils.prepare_data(
+        "imdb", self_dict=False, batch_size=128, buf_size=50000)
+
+    if sys.argv[1] == "bow":
+        train(
+            train_reader,
+            word_dict,
+            bow_net,
+            use_cuda=False,
+            parallel=False,
+            save_dirname="bow_model",
+            lr=0.002,
+            pass_num=30,
+            batch_size=128)
+    elif sys.argv[1] == "cnn":
+        train(
+            train_reader,
+            word_dict,
+            cnn_net,
+            use_cuda=True,
+            parallel=False,
+            save_dirname="cnn_model",
+            lr=0.01,
+            pass_num=30,
+            batch_size=4)
+    elif sys.argv[1] == "lstm":
+        train(
+            train_reader,
+            word_dict,
+            lstm_net,
+            use_cuda=True,
+            parallel=False,
+            save_dirname="lstm_model",
+            lr=0.05,
+            pass_num=30,
+            batch_size=4)
+    elif sys.argv[1] == "gru":
+        train(
+            train_reader,
+            word_dict,
+            gru_net,
+            use_cuda=True,
+            parallel=False,
+            save_dirname="gru_model",
+            lr=0.05,
+            pass_num=30,
+            batch_size=128)
+    else:
+        print("network name cannot be found!")
+        sys.exit(1)
+
-    train_pass_acc_evaluator = fluid.average.WeightedAverage()
-    test_pass_acc_evaluator = fluid.average.WeightedAverage()
-
-    def test(exe):
-        test_pass_acc_evaluator.reset()
-        for batch_id, data in enumerate(test_reader()):
-            input_seq = to_lodtensor(map(lambda x: x[0], data), place)
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-            b_acc, b_size = exe.run(inference_program,
-                                    feed={"words": input_seq,
-                                          "label": y_data},
-                                    fetch_list=[batch_acc_var, batch_size_var])
-            test_pass_acc_evaluator.add(value=b_acc, weight=b_size)
-        test_acc = test_pass_acc_evaluator.eval()
-        return test_acc
-
-    total_time = 0.
-    for pass_id in xrange(conf.num_passes):
-        train_pass_acc_evaluator.reset()
-        start_time = time.time()
-        for batch_id, data in enumerate(train_reader()):
-            cost_val, acc_val, size_val = exe.run(
-                fluid.default_main_program(),
-                feed=feeder.feed(data),
-                fetch_list=[avg_cost, batch_acc_var, batch_size_var])
-            train_pass_acc_evaluator.add(value=acc_val, weight=size_val)
-            if batch_id and batch_id % conf.log_period == 0:
-                print("Pass id: %d, batch id: %d, cost: %f, pass_acc: %f" %
-                      (pass_id, batch_id, cost_val,
-                       train_pass_acc_evaluator.eval()))
-        end_time = time.time()
-        total_time += (end_time - start_time)
-        pass_test_acc = test(exe)
-        print("Pass id: %d, test_acc: %f" % (pass_id, pass_test_acc))
-    print("Total train time: %f" % (total_time))
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    main(args.dict_path)
+if __name__ == "__main__":
+    train_net()
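The parallel branch of `train()` above follows the standard `fluid.layers.ParallelDo` pattern: each place builds a replica of the network, and the per-place cost and accuracy are averaged afterwards. A trimmed sketch of just that wrapper, under the same assumptions as the diff (two devices, a `network` callable returning cost, acc, prediction):

```python
import paddle.fluid as fluid


def build_parallel(network, data, label, dict_dim, device_count=2):
    """Replicate `network` across places and average its outputs."""
    places = fluid.layers.get_places(device_count=device_count)
    pd = fluid.layers.ParallelDo(places)
    with pd.do():
        # Each place reads its shard of the batch and builds a replica.
        cost, acc, _ = network(
            pd.read_input(data), pd.read_input(label), dict_dim)
        pd.write_output(cost)
        pd.write_output(acc)
    # Gather the per-place outputs and average them.
    cost, acc = pd()
    return fluid.layers.mean(cost), fluid.layers.mean(acc)
```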
diff --git a/fluid/text_classification/utils.py b/fluid/text_classification/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a37d8d720e1f97d7c70464ff15ef75e3ba049c32
--- /dev/null
+++ b/fluid/text_classification/utils.py
@@ -0,0 +1,108 @@
+import sys
+import time
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+
+import light_imdb
+import tiny_imdb
+
+
+def to_lodtensor(data, place):
+    """
+    Convert a batch of variable-length sequences to a LoDTensor.
+    """
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def load_vocab(filename):
+    """
+    Load the imdb vocabulary.
+    """
+    vocab = {}
+    with open(filename) as f:
+        wid = 0
+        for line in f:
+            vocab[line.strip()] = wid
+            wid += 1
+    vocab["<unk>"] = len(vocab)
+    return vocab
+
+
+def data2tensor(data, place):
+    """
+    Convert a batch of (word_ids, label) pairs into a feed dict.
+    """
+    input_seq = to_lodtensor(map(lambda x: x[0], data), place)
+    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+    y_data = y_data.reshape([-1, 1])
+    return {"words": input_seq, "label": y_data}
+
+
+def prepare_data(data_type="imdb",
+                 self_dict=False,
+                 batch_size=128,
+                 buf_size=50000):
+    """
+    Prepare the vocabulary and the train/test readers.
+    """
+    if self_dict:
+        word_dict = load_vocab(data_type + ".vocab")
+    else:
+        if data_type == "imdb":
+            word_dict = paddle.dataset.imdb.word_dict()
+        elif data_type == "light_imdb":
+            word_dict = light_imdb.word_dict()
+        elif data_type == "tiny_imdb":
+            word_dict = tiny_imdb.word_dict()
+        else:
+            raise RuntimeError("No such dataset")
+
+    if data_type == "imdb":
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.imdb.train(word_dict), buf_size=buf_size),
+            batch_size=batch_size)
+
+        test_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.imdb.test(word_dict), buf_size=buf_size),
+            batch_size=batch_size)
+
+    elif data_type == "light_imdb":
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                light_imdb.train(word_dict), buf_size=buf_size),
+            batch_size=batch_size)
+
+        test_reader = paddle.batch(
+            paddle.reader.shuffle(
+                light_imdb.test(word_dict), buf_size=buf_size),
+            batch_size=batch_size)
+
+    elif data_type == "tiny_imdb":
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                tiny_imdb.train(word_dict), buf_size=buf_size),
+            batch_size=batch_size)
+
+        test_reader = paddle.batch(
+            paddle.reader.shuffle(
+                tiny_imdb.test(word_dict), buf_size=buf_size),
+            batch_size=batch_size)
+    else:
+        raise RuntimeError("No such dataset")
+
+    return word_dict, train_reader, test_reader
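Taken together, the helpers in utils.py compose as in the short sketch below; the dataset name and sizes mirror the defaults used by train.py and infer.py above:

```python
import paddle.fluid as fluid

import utils

# Build the vocabulary plus shuffled, batched train/test readers.
word_dict, train_reader, test_reader = utils.prepare_data(
    "imdb", self_dict=False, batch_size=128, buf_size=50000)

place = fluid.CPUPlace()
for data in test_reader():
    # data2tensor turns a [(word_ids, label), ...] batch into a feed dict
    # with a LoDTensor under "words" and an int64 array under "label".
    feed = utils.data2tensor(data, place)
    break
```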