diff --git a/.gitignore b/.gitignore
index 2badc3bdaa52f2608183fa34393719be66630654..9e3a0b499f9f42856429f3a42bef313ea3df3699 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,12 +25,3 @@ third_party/
# clion workspace.
cmake-build-*
-
-# generated while compiling
-paddle/pybind/pybind.h
-CMakeFiles
-cmake_install.cmake
-paddle/.timestamp
-python/paddlepaddle.egg-info/
-paddle/fluid/pybind/pybind.h
-python/paddle/version.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 89c620bb2f7ef634fa80b64eec7037e8cb9a190c..6140340890c0e5025eb08209e8ea78df918b4dc0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,3 +1,4 @@
+repos:
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
sha: v1.0.1
hooks:
@@ -25,6 +26,14 @@
entry: bash ./.clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
+- repo: local
+ hooks:
+ - id: cpplint-cpp-source
+ name: cpplint
+ description: Check C++ code style using cpplint.py.
+ entry: bash ./tools/codestyle/cpplint_pre_commit.hook
+ language: system
+ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
diff --git a/.travis.yml b/.travis.yml
index bf6a41d13c4eabc2d8543ab821ce0ff747a061df..929c847bd36d64e79a199b2634ebf68c3225429b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -34,7 +34,7 @@ addons:
- automake
- libtool
- ccache
- ssh_known_hosts: 52.76.173.135
+ ssh_known_hosts: 13.229.163.131
before_install:
- if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
# Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
index b619613ea7a5b6e940ec735314e8e47338b2c600..64816098a524f064ec12474a736cd4c721227a70 100644
--- a/benchmark/cluster/README.md
+++ b/benchmark/cluster/README.md
@@ -36,11 +36,41 @@
- Trainer Count: 100
- Metrics: mini-batch / sec
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>Batch Size</th>
+<th>32</th>
+<th>64</th>
+<th>128</th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+
### Measure the Performance for Different PServer Count
@@ -48,11 +78,41 @@
- Batch Size: 64
- Metrics: mini-batch / sec
-| PServer Count | 10 | 20 | 40 | 60 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>PServer Count</th>
+<th>10</th>
+<th>20</th>
+<th>40</th>
+<th>60</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+
### Measure Parallel Efficiency By Increasing Trainer Count
@@ -67,11 +127,69 @@ The parallel efficiency is:
$E = \frac{S}{N}$
-| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
-| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - |
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>Trainer Counter</th>
+<th>1</th>
+<th>10</th>
+<th>20</th>
+<th>30</th>
+<th>40</th>
+<th>50</th>
+<th>60</th>
+<th>70</th>
+<th>80</th>
+<th>90</th>
+<th>100</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+
## Reproduce the benchmark
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index cd681a1a282d9a26eac1c267bfa26967f8c3c9fd..d56a912b9b03986e32693363f82df05a34b779e9 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -16,11 +16,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Metrics: samples / sec
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
+<table>
+<thead>
+<tr>
+<th>Batch Size</th>
+<th>32</th>
+<th>64</th>
+<th>128</th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>15.44</td>
+<td>16.32</td>
+<td>16.74</td>
+<td>16.79</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>15.97</td>
+<td>17.04</td>
+<td>17.60</td>
+<td>17.83</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>9.09</td>
+<td>9.10</td>
+<td>9.24</td>
+<td>8.66</td>
+</tr>
+</tbody>
+</table>
+
### Different Batch Size
@@ -28,12 +58,40 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Trainer Count: 20
- Metrics: samples / sec
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
-| TensorFlow | - | - | - | - |
-
+<table>
+<thead>
+<tr>
+<th>Batch Size</th>
+<th>32</th>
+<th>64</th>
+<th>128</th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>190.20</td>
+<td>222.15</td>
+<td>247.40</td>
+<td>258.18</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>170.96</td>
+<td>233.71</td>
+<td>256.14</td>
+<td>329.23</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+
### Accelerate Rate
@@ -41,11 +99,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Batch Size: 128
- Metrics: samples / sec
-| Trainer Count | 20 | 40 | 80 | 100 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
-| TensorFlow | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>Trainer Count</th>
+<th>20</th>
+<th>40</th>
+<th>80</th>
+<th>100</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>263.29 (78.64%)</td>
+<td>518.80 (77.47%)</td>
+<td>836.26 (62.44%)</td>
+<td>1019.29 (60.89%)</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2 (need more tests)</td>
+<td>326.85 (92.85%)</td>
+<td>534.58 (75.93%)</td>
+<td>853.30 (60.60%)</td>
+<td>1041.99 (59.20%)</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+
### Different Pserver Count
@@ -53,11 +141,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Batch Size: 128
- Metrics: samples / sec
-| PServer Count | 3 | 6 |10 | 20 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
-| TensorFlow | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>PServer Count</th>
+<th>3</th>
+<th>6</th>
+<th>10</th>
+<th>20</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid (should fix in next PR)</td>
+<td>589.1</td>
+<td>592.6</td>
+<td>656.4</td>
+<td>655.8</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2 (need more tests)</td>
+<td>593.4</td>
+<td>791.3</td>
+<td>729.7</td>
+<td>821.7</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+
*The performance gap between Fluid and v2 comes from network interference.*
diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7a421c10979c3b9d6865a8c0b99a6410e0f46a8
--- /dev/null
+++ b/benchmark/fluid/machine_translation.py
@@ -0,0 +1,379 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""seq2seq model for fluid."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import distutils.util
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+from paddle.fluid.executor import Executor
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ "--embedding_dim",
+ type=int,
+ default=512,
+ help="The dimension of embedding table. (default: %(default)d)")
+parser.add_argument(
+ "--encoder_size",
+ type=int,
+ default=512,
+ help="The size of encoder bi-rnn unit. (default: %(default)d)")
+parser.add_argument(
+ "--decoder_size",
+ type=int,
+ default=512,
+ help="The size of decoder rnn unit. (default: %(default)d)")
+parser.add_argument(
+ "--batch_size",
+ type=int,
+ default=16,
+ help="The sequence number of a mini-batch data. (default: %(default)d)")
+parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=5,
+ help='The number of initial mini-batches to skip, for a more accurate performance test')
+parser.add_argument(
+ '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+ "--dict_size",
+ type=int,
+ default=30000,
+ help="The dictionary capacity. Dictionaries of source sequence and "
+ "target dictionary have same capacity. (default: %(default)d)")
+parser.add_argument(
+ "--pass_num",
+ type=int,
+ default=2,
+ help="The pass number to train. (default: %(default)d)")
+parser.add_argument(
+ "--learning_rate",
+ type=float,
+ default=0.0002,
+ help="Learning rate used to train the model. (default: %(default)f)")
+parser.add_argument(
+ "--infer_only", action='store_true', help="If set, run forward only.")
+parser.add_argument(
+ "--beam_size",
+ type=int,
+ default=3,
+ help="The width for beam searching. (default: %(default)d)")
+parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help="The device type.")
+parser.add_argument(
+ "--max_length",
+ type=int,
+ default=250,
+ help="The maximum length of sequence when doing generation. "
+ "(default: %(default)d)")
+parser.add_argument(
+ '--with_test',
+ action='store_true',
+ help='If set, test the testset during training.')
+
+
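+# lstm_step implements one step of a standard LSTM cell:
+#   f = sigmoid(W_f [h, x]), i = sigmoid(W_i [h, x]), o = sigmoid(W_o [h, x]),
+#   c~ = tanh(W_c [h, x]); c_t = f * c_{t-1} + i * c~; h_t = o * tanh(c_t).
+# Each linear() call below creates its own fc layer, i.e. separate weights per gate.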
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+ def linear(inputs):
+ return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
+
+ forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+ input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+ output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+ cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
+
+ cell_t = fluid.layers.sums(input=[
+ fluid.layers.elementwise_mul(
+ x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
+ x=input_gate, y=cell_tilde)
+ ])
+
+ hidden_t = fluid.layers.elementwise_mul(
+ x=output_gate, y=fluid.layers.tanh(x=cell_t))
+
+ return hidden_t, cell_t
+
+
+def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
+ target_dict_dim, is_generating, beam_size, max_length):
+ """Construct a seq2seq network."""
+
+ def bi_lstm_encoder(input_seq, gate_size):
+ # Linear transformation part for input gate, output gate, forget gate
+ # and cell activation vectors need be done outside of dynamic_lstm.
+ # So the output size is 4 times of gate_size.
+ input_forward_proj = fluid.layers.fc(input=input_seq,
+ size=gate_size * 4,
+ act=None,
+ bias_attr=False)
+ forward, _ = fluid.layers.dynamic_lstm(
+ input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
+ input_reversed_proj = fluid.layers.fc(input=input_seq,
+ size=gate_size * 4,
+ act=None,
+ bias_attr=False)
+ reversed, _ = fluid.layers.dynamic_lstm(
+ input=input_reversed_proj,
+ size=gate_size * 4,
+ is_reverse=True,
+ use_peepholes=False)
+ return forward, reversed
+
+ src_word_idx = fluid.layers.data(
+ name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+
+ src_embedding = fluid.layers.embedding(
+ input=src_word_idx,
+ size=[source_dict_dim, embedding_dim],
+ dtype='float32')
+
+ src_forward, src_reversed = bi_lstm_encoder(
+ input_seq=src_embedding, gate_size=encoder_size)
+
+ encoded_vector = fluid.layers.concat(
+ input=[src_forward, src_reversed], axis=1)
+
+ encoded_proj = fluid.layers.fc(input=encoded_vector,
+ size=decoder_size,
+ bias_attr=False)
+
+ backward_first = fluid.layers.sequence_pool(
+ input=src_reversed, pool_type='first')
+
+ decoder_boot = fluid.layers.fc(input=backward_first,
+ size=decoder_size,
+ bias_attr=False,
+ act='tanh')
+
+ def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
+ decoder_boot, decoder_size):
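+ # simple_attention scores every encoder position against the current
+ # decoder state (a tanh fc over their concatenation), normalizes the
+ # scores with a sequence softmax, and returns the weighted sum of the
+ # encoder outputs as the context vector.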
+ def simple_attention(encoder_vec, encoder_proj, decoder_state):
+ decoder_state_proj = fluid.layers.fc(input=decoder_state,
+ size=decoder_size,
+ bias_attr=False)
+ decoder_state_expand = fluid.layers.sequence_expand(
+ x=decoder_state_proj, y=encoder_proj)
+ concated = fluid.layers.concat(
+ input=[encoder_proj, decoder_state_expand], axis=1)
+ attention_weights = fluid.layers.fc(input=concated,
+ size=1,
+ act='tanh',
+ bias_attr=False)
+ attention_weights = fluid.layers.sequence_softmax(
+ input=attention_weights)
+ weights_reshape = fluid.layers.reshape(
+ x=attention_weights, shape=[-1])
+ scaled = fluid.layers.elementwise_mul(
+ x=encoder_vec, y=weights_reshape, axis=0)
+ context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
+ return context
+
+ rnn = fluid.layers.DynamicRNN()
+
+ cell_init = fluid.layers.fill_constant_batch_size_like(
+ input=decoder_boot,
+ value=0.0,
+ shape=[-1, decoder_size],
+ dtype='float32')
+ cell_init.stop_gradient = False
+
+ with rnn.block():
+ current_word = rnn.step_input(target_embedding)
+ encoder_vec = rnn.static_input(encoder_vec)
+ encoder_proj = rnn.static_input(encoder_proj)
+ hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+ cell_mem = rnn.memory(init=cell_init)
+ context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
+ decoder_inputs = fluid.layers.concat(
+ input=[context, current_word], axis=1)
+ h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
+ rnn.update_memory(hidden_mem, h)
+ rnn.update_memory(cell_mem, c)
+ out = fluid.layers.fc(input=h,
+ size=target_dict_dim,
+ bias_attr=True,
+ act='softmax')
+ rnn.output(out)
+ return rnn()
+
+ if not is_generating:
+ trg_word_idx = fluid.layers.data(
+ name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+
+ trg_embedding = fluid.layers.embedding(
+ input=trg_word_idx,
+ size=[target_dict_dim, embedding_dim],
+ dtype='float32')
+
+ prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
+ encoded_proj, decoder_boot,
+ decoder_size)
+ label = fluid.layers.data(
+ name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+ cost = fluid.layers.cross_entropy(input=prediction, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
+
+ return avg_cost, feeding_list
+
+
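+# to_lodtensor packs a batch of variable-length sequences into one flat
+# LoDTensor: the sequences are concatenated along axis 0 and the LoD
+# (level-of-detail) list records the cumulative offset where each starts.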
+def to_lodtensor(data, place):
+ seq_lens = [len(seq) for seq in data]
+ cur_len = 0
+ lod = [cur_len]
+ for l in seq_lens:
+ cur_len += l
+ lod.append(cur_len)
+ flattened_data = np.concatenate(data, axis=0).astype("int64")
+ flattened_data = flattened_data.reshape([len(flattened_data), 1])
+ lod_t = core.LoDTensor()
+ lod_t.set(flattened_data, place)
+ lod_t.set_lod([lod])
+ return lod_t, lod[-1]
+
+
+def lodtensor_to_ndarray(lod_tensor):
+ dims = lod_tensor.get_dims()
+ ndarray = np.zeros(shape=dims).astype('float32')
+ for i in xrange(np.product(dims)):
+ ndarray.ravel()[i] = lod_tensor.get_float_element(i)
+ return ndarray
+
+
+def train():
+ avg_cost, feeding_list = seq_to_seq_net(
+ args.embedding_dim,
+ args.encoder_size,
+ args.decoder_size,
+ args.dict_size,
+ args.dict_size,
+ False,
+ beam_size=args.beam_size,
+ max_length=args.max_length)
+
+ # clone from default main program
+ inference_program = fluid.default_main_program().clone()
+
+ optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+ optimizer.minimize(avg_cost)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ train_batch_generator = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+ batch_size=args.batch_size)
+
+ test_batch_generator = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+ batch_size=args.batch_size)
+
+ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ exe = Executor(place)
+ exe.run(framework.default_startup_program())
+
+ def do_validation():
+ total_loss = 0.0
+ count = 0
+ for batch_id, data in enumerate(test_batch_generator()):
+ src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0]
+ trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0]
+ lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0]
+
+ fetch_outs = exe.run(inference_program,
+ feed={
+ feeding_list[0]: src_seq,
+ feeding_list[1]: trg_seq,
+ feeding_list[2]: lbl_seq
+ },
+ fetch_list=[avg_cost],
+ return_numpy=False)
+
+ total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
+ count += 1
+
+ return total_loss / count
+
+ iters, num_samples, start_time = 0, 0, time.time()
+ for pass_id in xrange(args.pass_num):
+ train_accs = []
+ train_losses = []
+ for batch_id, data in enumerate(train_batch_generator()):
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+ if iters == args.iterations:
+ break
+ src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
+ num_samples += word_num
+ trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
+ num_samples += word_num
+ lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
+
+ fetch_outs = exe.run(framework.default_main_program(),
+ feed={
+ feeding_list[0]: src_seq,
+ feeding_list[1]: trg_seq,
+ feeding_list[2]: lbl_seq
+ },
+ fetch_list=[avg_cost])
+
+ iters += 1
+ loss = np.array(fetch_outs[0])
+ print("Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss))
+
+ train_elapsed = time.time() - start_time
+ examples_per_sec = num_samples / train_elapsed
+ print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+ (num_samples, train_elapsed, examples_per_sec))
+ # evaluation
+ if args.with_test:
+ test_loss = do_validation()
+ exit(0)
+
+
+def infer():
+ pass
+
+
+def print_arguments(args):
+ print('----------- seq2seq Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ print_arguments(args)
+ if args.infer_only:
+ infer()
+ else:
+ train()
diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..43866da9cb113e9d49fc1c51f67da94cbc6bfd8e
--- /dev/null
+++ b/benchmark/fluid/mnist.py
@@ -0,0 +1,227 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import cProfile
+import time
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+
+SEED = 1
+DTYPE = "float32"
+
+# The random seed must be set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("mnist model benchmark.")
+ parser.add_argument(
+ '--batch_size', type=int, default=128, help='The minibatch size.')
+ parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=5,
+ help='The number of initial mini-batches to skip, for a more accurate performance test'
+ )
+ parser.add_argument(
+ '--iterations', type=int, default=35, help='The number of minibatches.')
+ parser.add_argument(
+ '--pass_num', type=int, default=5, help='The number of passes.')
+ parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help='The device type.')
+ parser.add_argument(
+ '--infer_only', action='store_true', help='If set, run forward only.')
+ parser.add_argument(
+ '--use_cprof', action='store_true', help='If set, use cProfile.')
+ parser.add_argument(
+ '--use_nvprof',
+ action='store_true',
+ help='If set, use nvprof for CUDA.')
+ parser.add_argument(
+ '--with_test',
+ action='store_true',
+ help='If set, test the testset during training.')
+ args = parser.parse_args()
+ return args
+
+
+def cnn_model(data):
+ conv_pool_1 = fluid.nets.simple_img_conv_pool(
+ input=data,
+ filter_size=5,
+ num_filters=20,
+ pool_size=2,
+ pool_stride=2,
+ act="relu")
+ conv_pool_2 = fluid.nets.simple_img_conv_pool(
+ input=conv_pool_1,
+ filter_size=5,
+ num_filters=50,
+ pool_size=2,
+ pool_stride=2,
+ act="relu")
+
+ # TODO(dzhwinter) : refine the initializer and random seed setting
+ SIZE = 10
+ input_shape = conv_pool_2.shape
+ param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
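+ # Std-dev for the Normal initializer of the final fc weights; a
+ # hand-rolled He-style scale based on the flattened conv output size.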
+ scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+ predict = fluid.layers.fc(
+ input=conv_pool_2,
+ size=SIZE,
+ act="softmax",
+ param_attr=fluid.param_attr.ParamAttr(
+ initializer=fluid.initializer.NormalInitializer(
+ loc=0.0, scale=scale)))
+ return predict
+
+
+def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
+ test_reader = paddle.batch(
+ paddle.dataset.mnist.test(), batch_size=args.batch_size)
+ test_pass_acc = fluid.average.WeightedAverage()
+ for batch_id, data in enumerate(test_reader()):
+ img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
+ data)).astype(DTYPE)
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([len(y_data), 1])
+
+ acc, weight = exe.run(inference_program,
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[batch_acc, batch_size_tensor])
+ test_pass_acc.add(value=acc, weight=weight)
+ pass_acc = test_pass_acc.eval()
+ return pass_acc
+
+
+def run_benchmark(model, args):
+ if args.use_cprof:
+ pr = cProfile.Profile()
+ pr.enable()
+ start_time = time.time()
+ # Input data
+ images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+ # Train program
+ predict = model(images)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ # Evaluator
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(
+ input=predict, label=label, total=batch_size_tensor)
+
+ # inference program
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ # Optimization
+ opt = fluid.optimizer.AdamOptimizer(
+ learning_rate=0.001, beta1=0.9, beta2=0.999)
+ opt.minimize(avg_cost)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ # Initialize executor
+ place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+ exe = fluid.Executor(place)
+
+ # Parameter initialization
+ exe.run(fluid.default_startup_program())
+
+ # Reader
+ train_reader = paddle.batch(
+ paddle.dataset.mnist.train(), batch_size=args.batch_size)
+
+ accuracy = fluid.average.WeightedAverage()
+ iters, num_samples, start_time = 0, 0, time.time()
+ for pass_id in range(args.pass_num):
+ accuracy.reset()
+ train_accs = []
+ train_losses = []
+ for batch_id, data in enumerate(train_reader()):
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+ if iters == args.iterations:
+ break
+ img_data = np.array(
+ map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([len(y_data), 1])
+
+ outs = exe.run(
+ fluid.default_main_program(),
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[avg_cost, batch_acc, batch_size_tensor]
+ )  # The fetched accuracy is for the current mini-batch only.
+ accuracy.add(value=outs[1], weight=outs[2])
+ iters += 1
+ num_samples += len(y_data)
+ loss = np.array(outs[0])
+ acc = np.array(outs[1])
+ train_losses.append(loss)
+ train_accs.append(acc)
+ print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
+ (pass_id, iters, loss, acc))
+
+ print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+ (pass_id, np.mean(train_losses), np.mean(train_accs)))
+ train_elapsed = time.time() - start_time
+ examples_per_sec = num_samples / train_elapsed
+
+ print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+ (num_samples, train_elapsed, examples_per_sec))
+ # evaluation
+ if args.with_test:
+ test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+ inference_program)
+ exit(0)
+
+
+def print_arguments(args):
+ vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+ vars(args)['device'] == 'GPU')
+ print('----------- mnist Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ print_arguments(args)
+ if args.use_nvprof and args.device == 'GPU':
+ with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+ run_benchmark(cnn_model, args)
+ else:
+ run_benchmark(cnn_model, args)
diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..1af5eaf6b46be47cb6b778cedcf53830c201ef39
--- /dev/null
+++ b/benchmark/fluid/resnet.py
@@ -0,0 +1,313 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import functools
+import numpy as np
+import time
+
+import cProfile, pstats, StringIO
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+
+
+def parse_args():
+ parser = argparse.ArgumentParser('Convolution model benchmark.')
+ parser.add_argument(
+ '--model',
+ type=str,
+ choices=['resnet_imagenet', 'resnet_cifar10'],
+ default='resnet_imagenet',
+ help='The model architecture.')
+ parser.add_argument(
+ '--batch_size', type=int, default=32, help='The minibatch size.')
+ parser.add_argument(
+ '--use_fake_data',
+ action='store_true',
+ help='If set, use fake data instead of reading the real dataset.')
+ parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=5,
+ help='The number of initial mini-batches to skip, for a more accurate performance test'
+ )
+ parser.add_argument(
+ '--iterations', type=int, default=80, help='The number of minibatches.')
+ parser.add_argument(
+ '--pass_num', type=int, default=100, help='The number of passes.')
+ parser.add_argument(
+ '--data_format',
+ type=str,
+ default='NCHW',
+ choices=['NCHW', 'NHWC'],
+ help='The data format; only NCHW is currently supported.')
+ parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help='The device type.')
+ parser.add_argument(
+ '--data_set',
+ type=str,
+ default='flowers',
+ choices=['cifar10', 'flowers'],
+ help='Optional dataset for benchmark.')
+ parser.add_argument(
+ '--infer_only', action='store_true', help='If set, run forward only.')
+ parser.add_argument(
+ '--use_cprof', action='store_true', help='If set, use cProfile.')
+ parser.add_argument(
+ '--use_nvprof',
+ action='store_true',
+ help='If set, use nvprof for CUDA.')
+ parser.add_argument(
+ '--with_test',
+ action='store_true',
+ help='If set, test the testset during training.')
+ args = parser.parse_args()
+ return args
+
+
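+# conv_bn_layer: a bias-free conv2d followed by batch norm; the activation
+# is applied after the batch norm, as is conventional for ResNet.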
+def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+ conv1 = fluid.layers.conv2d(
+ input=input,
+ filter_size=filter_size,
+ num_filters=ch_out,
+ stride=stride,
+ padding=padding,
+ act=None,
+ bias_attr=False)
+ return fluid.layers.batch_norm(input=conv1, act=act)
+
+
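+# shortcut returns the identity when the input and output channel counts
+# match, and a 1x1 projection conv otherwise, so the residual add lines up.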
+def shortcut(input, ch_out, stride):
+ ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1]
+ if ch_in != ch_out:
+ return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+ else:
+ return input
+
+
+def basicblock(input, ch_out, stride):
+ short = shortcut(input, ch_out, stride)
+ conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
+ conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+ return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def bottleneck(input, ch_out, stride):
+ short = shortcut(input, ch_out * 4, stride)
+ conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
+ conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
+ conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+ return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):
+ res_out = block_func(input, ch_out, stride)
+ for i in range(1, count):
+ res_out = block_func(res_out, ch_out, 1)
+ return res_out
+
+
+def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+
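+ # Map depth to (residual blocks per stage, block type): shallow ResNets
+ # use the basic 3x3 block, deeper ones the 1x1-3x3-1x1 bottleneck block.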
+ cfg = {
+ 18: ([2, 2, 2, 2], basicblock),
+ 34: ([3, 4, 6, 3], basicblock),
+ 50: ([3, 4, 6, 3], bottleneck),
+ 101: ([3, 4, 23, 3], bottleneck),
+ 152: ([3, 8, 36, 3], bottleneck)
+ }
+ stages, block_func = cfg[depth]
+ conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+ pool1 = fluid.layers.pool2d(
+ input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+ res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+ res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+ res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+ res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+ pool2 = fluid.layers.pool2d(
+ input=res4,
+ pool_size=7,
+ pool_type='avg',
+ pool_stride=1,
+ global_pooling=True)
+ out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
+ return out
+
+
+def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
+ assert (depth - 2) % 6 == 0
+
+ n = (depth - 2) // 6
+
+ conv1 = conv_bn_layer(
+ input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+ res1 = layer_warp(basicblock, conv1, 16, n, 1)
+ res2 = layer_warp(basicblock, res1, 32, n, 2)
+ res3 = layer_warp(basicblock, res2, 64, n, 2)
+ pool = fluid.layers.pool2d(
+ input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+ out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
+ return out
+
+
+def run_benchmark(model, args):
+ if args.use_cprof:
+ pr = cProfile.Profile()
+ pr.enable()
+
+ if args.data_set == "cifar10":
+ class_dim = 10
+ if args.data_format == 'NCHW':
+ dshape = [3, 32, 32]
+ else:
+ dshape = [32, 32, 3]
+ else:
+ class_dim = 102
+ if args.data_format == 'NCHW':
+ dshape = [3, 224, 224]
+ else:
+ dshape = [224, 224, 3]
+
+ input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ predict = model(input, class_dim)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(
+ input=predict, label=label, total=batch_size_tensor)
+
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+ opts = optimizer.minimize(avg_cost)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+
+ def test(exe):
+ test_accuracy = fluid.average.WeightedAverage()
+ for batch_id, data in enumerate(test_reader()):
+ img_data = np.array(map(lambda x: x[0].reshape(dshape),
+ data)).astype("float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ acc, weight = exe.run(inference_program,
+ feed={"data": img_data,
+ "label": y_data},
+ fetch_list=[batch_acc, batch_size_tensor])
+ test_accuracy.add(value=acc, weight=weight)
+
+ return test_accuracy.eval()
+
+ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ accuracy = fluid.average.WeightedAverage()
+ if args.use_fake_data:
+ data = train_reader().next()
+ image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
+ 'float32')
+ label = np.array(map(lambda x: x[1], data)).astype('int64')
+ label = label.reshape([-1, 1])
+
+ iters, num_samples, start_time = 0, 0, time.time()
+ for pass_id in range(args.pass_num):
+ accuracy.reset()
+ train_accs = []
+ train_losses = []
+ for batch_id, data in enumerate(train_reader()):
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+ if iters == args.iterations:
+ break
+ if not args.use_fake_data:
+ image = np.array(map(lambda x: x[0].reshape(dshape),
+ data)).astype('float32')
+ label = np.array(map(lambda x: x[1], data)).astype('int64')
+ label = label.reshape([-1, 1])
+ loss, acc, weight = exe.run(
+ fluid.default_main_program(),
+ feed={'data': image,
+ 'label': label},
+ fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+ iters += 1
+ num_samples += len(label)
+ accuracy.add(value=acc, weight=weight)
+ train_losses.append(loss)
+ train_accs.append(acc)
+ print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
+ (pass_id, iters, loss, acc))
+ print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+ (pass_id, np.mean(train_losses), np.mean(train_accs)))
+ train_elapsed = time.time() - start_time
+ examples_per_sec = num_samples / train_elapsed
+ print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+ (num_samples, train_elapsed, examples_per_sec))
+ # evaluation
+ if args.with_test:
+ pass_test_acc = test(exe)
+ exit(0)
+
+
+def print_arguments(args):
+ vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+ vars(args)['device'] == 'GPU')
+ print('----------- resnet Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+ model_map = {
+ 'resnet_imagenet': resnet_imagenet,
+ 'resnet_cifar10': resnet_cifar10
+ }
+ args = parse_args()
+ print_arguments(args)
+ if args.data_format == 'NHWC':
+ raise ValueError('Only the NCHW data_format is supported for now.')
+ if args.use_nvprof and args.device == 'GPU':
+ with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+ run_benchmark(model_map[args.model], args)
+ else:
+ run_benchmark(model_map[args.model], args)
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f6dfd20bf2ee0b668b6d4238d4511253b2233035
--- /dev/null
+++ b/benchmark/fluid/run.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# This script benchmarks PaddlePaddle Fluid on a
+# single thread and a single GPU.
+
+#export FLAGS_fraction_of_gpu_memory_to_use=0.0
+export CUDNN_PATH=/paddle/cudnn_v5
+
+# disable openmp and mkl parallel
+#https://github.com/PaddlePaddle/Paddle/issues/7199
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+if [ $ht -eq 1 ]; then # HT is OFF
+ if [ -z "$KMP_AFFINITY" ]; then
+ export KMP_AFFINITY="granularity=fine,compact,0,0"
+ fi
+ if [ -z "$OMP_DYNAMIC" ]; then
+ export OMP_DYNAMIC="FALSE"
+ fi
+else # HT is ON
+ if [ -z "$KMP_AFFINITY" ]; then
+ export KMP_AFFINITY="granularity=fine,compact,1,0"
+ fi
+fi
+# disable multi-gpu if have more than one
+export CUDA_VISIBLE_DEVICES=0
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
+
+# only query the gpu used
+nohup stdbuf -oL nvidia-smi \
+ --id=${CUDA_VISIBLE_DEVICES} \
+ --query-gpu=timestamp \
+ --query-compute-apps=pid,process_name,used_memory \
+ --format=csv \
+ --filename=mem.log \
+ -l 1 &
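+
+# Each benchmark below runs with line-buffered stdout (stdbuf -oL) and tees
+# its output into a per-run log file, with FLAGS_benchmark=true enabled.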
+# mnist
+# mnist gpu mnist 128
+FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+ --device=GPU \
+ --batch_size=128 \
+ --skip_batch_num=5 \
+ --iterations=500 \
+ 2>&1 | tee -a mnist_gpu_128.log
+
+# vgg16
+# gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg.py \
+ --device=GPU \
+ --batch_size=128 \
+ --skip_batch_num=5 \
+ --iterations=30 \
+ 2>&1 | tee -a vgg16_gpu_128.log
+
+# vgg16 gpu flowers 32
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg.py \
+ --device=GPU \
+ --batch_size=32 \
+ --data_set=flowers \
+ --skip_batch_num=5 \
+ --iterations=30 \
+ 2>&1 | tee -a vgg16_gpu_flowers_32.log
+
+# resnet50
+# resnet50 gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet.py \
+ --device=GPU \
+ --batch_size=128 \
+ --data_set=cifar10 \
+ --model=resnet_cifar10 \
+ --skip_batch_num=5 \
+ --iterations=30 \
+ 2>&1 | tee -a resnet50_gpu_128.log
+
+# resnet50 gpu flowers 64
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet.py \
+ --device=GPU \
+ --batch_size=64 \
+ --data_set=flowers \
+ --model=resnet_imagenet \
+ --skip_batch_num=5 \
+ --iterations=30 \
+ 2>&1 | tee -a resnet50_gpu_flowers_64.log
+
+# lstm
+# lstm gpu imdb 32 # tensorflow only support batch=32
+FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+ --device=GPU \
+ --batch_size=32 \
+ --skip_batch_num=5 \
+ --iterations=30 \
+ --hidden_dim=512 \
+ --emb_dim=512 \
+ --crop_size=1500 \
+ 2>&1 | tee -a lstm_gpu_32.log
+
+# seq2seq
+# seq2seq gpu wmt14 128
+FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+ --device=GPU \
+ --batch_size=128 \
+ --skip_batch_num=5 \
+ --iterations=30 \
+ 2>&1 | tee -a seq2seq_gpu_128.log
diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fcbdd64af9dc196c9d5b2b82ce4213478ea1418
--- /dev/null
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@@ -0,0 +1,236 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import cPickle
+import os
+import random
+import time
+
+import numpy
+import paddle.v2 as paddle
+import paddle.v2.dataset.imdb as imdb
+import paddle.fluid as fluid
+from paddle.v2 import batch
+import paddle.fluid.profiler as profiler
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=32,
+ help='The number of sequences in a mini-batch. (default: %(default)d)')
+ parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=5,
+ help='The number of initial mini-batches to skip, for a more accurate performance test'
+ )
+ parser.add_argument(
+ '--iterations', type=int, default=80, help='The number of minibatches.')
+ parser.add_argument(
+ '--emb_dim',
+ type=int,
+ default=512,
+ help='Dimension of embedding table. (default: %(default)d)')
+ parser.add_argument(
+ '--hidden_dim',
+ type=int,
+ default=512,
+ help='Hidden size of lstm unit. (default: %(default)d)')
+ parser.add_argument(
+ '--pass_num',
+ type=int,
+ default=100,
+ help='Epoch number to train. (default: %(default)d)')
+ parser.add_argument(
+ '--device',
+ type=str,
+ default='CPU',
+ choices=['CPU', 'GPU'],
+ help='The device type.')
+ parser.add_argument(
+ '--crop_size',
+ type=int,
+ default=int(os.environ.get('CROP_SIZE', '1500')),
+ help='The maximum sentence length of the input. Since this model uses a '
+ 'plain RNN, gradients could explode if the sentence is too long.')
+ parser.add_argument(
+ '--with_test',
+ action='store_true',
+ help='If set, test the testset during training.')
+ args = parser.parse_args()
+ return args
+
+
+word_dict = imdb.word_dict()
+
+
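+# crop_sentence filters the reader, yielding only samples whose count of
+# known (non-<unk>) words is below crop_size, to bound the RNN length.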
+def crop_sentence(reader, crop_size):
+ unk_value = word_dict['<unk>']
+
+ def __impl__():
+ for item in reader():
+ if len([x for x in item[0] if x != unk_value]) < crop_size:
+ yield item
+
+ return __impl__
+
+
+def main():
+ args = parse_args()
+ lstm_size = args.hidden_dim
+
+ data = fluid.layers.data(
+ name="words", shape=[1], lod_level=1, dtype='int64')
+ sentence = fluid.layers.embedding(
+ input=data, size=[len(word_dict), args.emb_dim])
+
+ sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
+
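+ # DynamicRNN unrolls over the variable-length sequence one step at a
+ # time; each step reads one word embedding and updates the LSTM
+ # hidden/cell state held in rnn.memory().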
+ rnn = fluid.layers.DynamicRNN()
+ with rnn.block():
+ word = rnn.step_input(sentence)
+ prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+ prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+ def gate_common(ipt, hidden, size):
+ gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+ gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+ gate = fluid.layers.sums(input=[gate0, gate1])
+ return gate
+
+ forget_gate = fluid.layers.sigmoid(
+ x=gate_common(word, prev_hidden, lstm_size))
+ input_gate = fluid.layers.sigmoid(
+ x=gate_common(word, prev_hidden, lstm_size))
+ output_gate = fluid.layers.sigmoid(
+ x=gate_common(word, prev_hidden, lstm_size))
+ cell_gate = fluid.layers.tanh(
+ x=gate_common(word, prev_hidden, lstm_size))
+
+ cell = fluid.layers.sums(input=[
+ fluid.layers.elementwise_mul(
+ x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
+ x=input_gate, y=cell_gate)
+ ])
+
+ hidden = fluid.layers.elementwise_mul(
+ x=output_gate, y=fluid.layers.tanh(x=cell))
+
+ rnn.update_memory(prev_cell, cell)
+ rnn.update_memory(prev_hidden, hidden)
+ rnn.output(hidden)
+
+ last = fluid.layers.sequence_pool(rnn(), 'last')
+ logit = fluid.layers.fc(input=last, size=2, act='softmax')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ loss = fluid.layers.cross_entropy(input=logit, label=label)
+ loss = fluid.layers.mean(x=loss)
+
+ # add acc; reuse the one label layer instead of declaring it twice
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(
+ input=logit, label=label, total=batch_size_tensor)
+
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ adam = fluid.optimizer.Adam()
+ adam.minimize(loss)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+
+ train_reader = batch(
+ paddle.reader.shuffle(
+ crop_sentence(imdb.train(word_dict), args.crop_size),
+ buf_size=25000),
+ batch_size=args.batch_size)
+
+ iters, num_samples, start_time = 0, 0, time.time()
+ for pass_id in range(args.pass_num):
+ train_accs = []
+ train_losses = []
+ for batch_id, data in enumerate(train_reader()):
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+ if iters == args.iterations:
+ break
+ tensor_words = to_lodtensor([x[0] for x in data], place)
+ label = numpy.array([x[1] for x in data]).astype("int64")
+ label = label.reshape((-1, 1))
+ loss_np, acc, weight = exe.run(
+ fluid.default_main_program(),
+ feed={"words": tensor_words,
+ "label": label},
+ fetch_list=[loss, batch_acc, batch_size_tensor])
+ iters += 1
+ for x in data:
+ num_samples += len(x[0])
+ print(
+ "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+ (pass_id, iters, loss_np, acc)
+ )  # The printed accuracy is for the current mini-batch only.
+
+ train_elapsed = time.time() - start_time
+ examples_per_sec = num_samples / train_elapsed
+ print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+ (num_samples, train_elapsed, examples_per_sec))
+ exit(0)
+
+
+def to_lodtensor(data, place):
+ seq_lens = [len(seq) for seq in data]
+ cur_len = 0
+ lod = [cur_len]
+ for l in seq_lens:
+ cur_len += l
+ lod.append(cur_len)
+ flattened_data = numpy.concatenate(data, axis=0).astype("int64")
+ flattened_data = flattened_data.reshape([len(flattened_data), 1])
+ res = fluid.LoDTensor()
+ res.set(flattened_data, place)
+ res.set_lod([lod])
+ return res
+
+
+def print_arguments(args):
+ print('----------- lstm Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ print_arguments(args)
+ main()
diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d990eff62ec368dc7033f55cc0862fa974a64e0
--- /dev/null
+++ b/benchmark/fluid/vgg.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import argparse
+import functools
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=5,
+ help='The number of initial mini-batches to skip, for a more accurate performance test')
+parser.add_argument(
+ '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+ '--learning_rate',
+ type=float,
+ default=1e-3,
+ help="Learning rate for training.")
+parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help="The device type.")
+parser.add_argument(
+ '--data_format',
+ type=str,
+ default='NCHW',
+ choices=['NCHW', 'NHWC'],
+ help='The data layout; only NCHW is currently supported.')
+parser.add_argument(
+ '--data_set',
+ type=str,
+ default='cifar10',
+ choices=['cifar10', 'flowers'],
+ help='Optional dataset for benchmark.')
+parser.add_argument(
+ '--with_test',
+ action='store_true',
+ help='If set, test the testset during training.')
+args = parser.parse_args()
+
+
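+# vgg16_bn_drop builds the 13 conv layers of VGG-16 as five conv blocks
+# (with batch norm and dropout inside each block) plus two 512-wide fc
+# layers; the final softmax classifier is appended by the caller.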
+def vgg16_bn_drop(input):
+ def conv_block(input, num_filter, groups, dropouts):
+ return fluid.nets.img_conv_group(
+ input=input,
+ pool_size=2,
+ pool_stride=2,
+ conv_num_filter=[num_filter] * groups,
+ conv_filter_size=3,
+ conv_act='relu',
+ conv_with_batchnorm=True,
+ conv_batchnorm_drop_rate=dropouts,
+ pool_type='max')
+
+ conv1 = conv_block(input, 64, 2, [0.3, 0])
+ conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+ conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+ conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+ conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+ drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+ fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+ bn = fluid.layers.batch_norm(input=fc1, act='relu')
+ drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+ fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+ return fc2
+
+
+def main():
+ if args.data_set == "cifar10":
+ classdim = 10
+ if args.data_format == 'NCHW':
+ data_shape = [3, 32, 32]
+ else:
+ data_shape = [32, 32, 3]
+ else:
+ classdim = 102
+ if args.data_format == 'NCHW':
+ data_shape = [3, 224, 224]
+ else:
+ data_shape = [224, 224, 3]
+
+ # Input data
+ images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+ # Train program
+ net = vgg16_bn_drop(images)
+ predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ # Evaluator
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(
+ input=predict, label=label, total=batch_size_tensor)
+
+ # inference program
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ # Optimization
+ optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+ opts = optimizer.minimize(avg_cost)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ # Initialize executor
+ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ exe = fluid.Executor(place)
+
+ # Parameter initialization
+ exe.run(fluid.default_startup_program())
+
+ # data reader
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+
+ # test
+ def test(exe):
+ test_accuracy = fluid.average.WeightedAverage()
+ for batch_id, data in enumerate(test_reader()):
+ img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+ data)).astype("float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ acc, weight = exe.run(inference_program,
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[batch_acc, batch_size_tensor])
+ test_accuracy.add(value=acc, weight=weight)
+ return test_accuracy.eval()
+
+ iters, num_samples, start_time = 0, 0, time.time()
+ accuracy = fluid.average.WeightedAverage()
+ for pass_id in range(args.pass_num):
+ accuracy.reset()
+ train_accs = []
+ train_losses = []
+ for batch_id, data in enumerate(train_reader()):
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+ if iters == args.iterations:
+ break
+ img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+ data)).astype("float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ loss, acc, weight = exe.run(
+ fluid.default_main_program(),
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+ accuracy.add(value=acc, weight=weight)
+ iters += 1
+ num_samples += len(y_data)
+ print(
+ "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+ (pass_id, iters, loss, acc)
+ )  # The printed accuracy is for the current mini-batch only.
+
+ # pass_train_acc = accuracy.eval()
+ train_losses.append(loss)
+ train_accs.append(acc)
+ print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+ (pass_id, np.mean(train_losses), np.mean(train_accs)))
+ train_elapsed = time.time() - start_time
+ examples_per_sec = num_samples / train_elapsed
+ print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+ (num_samples, train_elapsed, examples_per_sec))
+ # evaluation
+ if args.with_test:
+ pass_test_acc = test(exe)
+ exit(0)
+
+
+def print_arguments():
+ print('----------- vgg Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == "__main__":
+ print_arguments()
+ main()
diff --git a/benchmark/tensorflow/machine_translation.py b/benchmark/tensorflow/machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f77dce98353af53803246be8dc61063836b7867
--- /dev/null
+++ b/benchmark/tensorflow/machine_translation.py
@@ -0,0 +1,626 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.python.framework import dtypes
+from tensorflow.python.layers.core import Dense
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops.rnn_cell_impl import RNNCell, BasicLSTMCell
+from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple
+from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
+from tensorflow.contrib.rnn.python.ops import core_rnn_cell
+from tensorflow.python.ops import array_ops
+from tensorflow.python.util import nest
+import tensorflow.contrib.seq2seq as seq2seq
+from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
+import numpy as np
+import logging
+import os
+import argparse
+import time
+
+import paddle.v2 as paddle
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ "--embedding_dim",
+ type=int,
+ default=512,
+ help="The dimension of embedding table. (default: %(default)d)")
+parser.add_argument(
+ "--encoder_size",
+ type=int,
+ default=512,
+ help="The size of encoder bi-rnn unit. (default: %(default)d)")
+parser.add_argument(
+ "--decoder_size",
+ type=int,
+ default=512,
+ help="The size of decoder rnn unit. (default: %(default)d)")
+parser.add_argument(
+ "--batch_size",
+ type=int,
+ default=128,
+ help="The sequence number of a mini-batch data. (default: %(default)d)")
+parser.add_argument(
+ "--dict_size",
+ type=int,
+ default=30000,
+ help="The dictionary capacity. Dictionaries of source sequence and "
+ "target dictionary have same capacity. (default: %(default)d)")
+parser.add_argument(
+ "--max_time_steps",
+ type=int,
+ default=81,
+ help="Max number of time steps for sequence. (default: %(default)d)")
+parser.add_argument(
+ "--pass_num",
+ type=int,
+ default=10,
+ help="The pass number to train. (default: %(default)d)")
+parser.add_argument(
+ "--learning_rate",
+ type=float,
+ default=0.0002,
+ help="Learning rate used to train the model. (default: %(default)f)")
+parser.add_argument(
+ "--infer_only", action='store_true', help="If set, run forward only.")
+parser.add_argument(
+ "--beam_size",
+ type=int,
+ default=3,
+ help="The width for beam searching. (default: %(default)d)")
+parser.add_argument(
+ "--max_generation_length",
+ type=int,
+ default=250,
+ help="The maximum length of sequence when doing generation. "
+ "(default: %(default)d)")
+parser.add_argument(
+ "--save_freq",
+ type=int,
+ default=500,
+ help="Save model checkpoint every this interation. (default: %(default)d)")
+parser.add_argument(
+ "--model_dir",
+ type=str,
+ default='./checkpoint',
+ help="Path to save model checkpoints. (default: %(default)d)")
+
+_Linear = core_rnn_cell._Linear # pylint: disable=invalid-name
+
+START_TOKEN_IDX = 0
+END_TOKEN_IDX = 1
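+# These constants assume the token layout of the paddle.dataset.wmt14 readers,
+# which reserve index 0 for the start token <s> and index 1 for the end token
+# <e>.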
+
+
+class LSTMCellWithSimpleAttention(RNNCell):
+ """Add attention mechanism to BasicLSTMCell.
+ This class is a wrapper based on tensorflow's `BasicLSTMCell`.
+ """
+
+ def __init__(self,
+ num_units,
+ encoder_vector,
+ encoder_proj,
+ source_sequence_length,
+ forget_bias=1.0,
+ state_is_tuple=True,
+ activation=None,
+ reuse=None):
+ super(LSTMCellWithSimpleAttention, self).__init__(_reuse=reuse)
+ if not state_is_tuple:
+ logging.warn("%s: Using a concatenated state is slower and will "
+ "soon be deprecated. Use state_is_tuple=True.", self)
+ self._num_units = num_units
+ # set padding part to 0
+ self._encoder_vector = self._reset_padding(encoder_vector,
+ source_sequence_length)
+ self._encoder_proj = self._reset_padding(encoder_proj,
+ source_sequence_length)
+ self._forget_bias = forget_bias
+ self._state_is_tuple = state_is_tuple
+ self._activation = activation or math_ops.tanh
+ self._linear = None
+
+ @property
+ def state_size(self):
+ return (LSTMStateTuple(self._num_units, self._num_units) \
+ if self._state_is_tuple else 2 * self._num_units)
+
+ @property
+ def output_size(self):
+ return self._num_units
+
+ def zero_state(self, batch_size, dtype):
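+        # The zero state is cached: dynamic_decode may request it repeatedly
+        # with the same (state_size, batch_size, dtype), and reusing the
+        # cached tensors avoids adding duplicate nodes to the graph.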
+ state_size = self.state_size
+ if hasattr(self, "_last_zero_state"):
+ (last_state_size, last_batch_size, last_dtype,
+ last_output) = getattr(self, "_last_zero_state")
+ if (last_batch_size == batch_size and last_dtype == dtype and
+ last_state_size == state_size):
+ return last_output
+ with ops.name_scope(
+ type(self).__name__ + "ZeroState", values=[batch_size]):
+            output = rnn_cell_impl._zero_state_tensors(state_size, batch_size,
+                                                       dtype)
+ self._last_zero_state = (state_size, batch_size, dtype, output)
+ return output
+
+ def call(self, inputs, state):
+ sigmoid = math_ops.sigmoid
+ # Parameters of gates are concatenated into one multiply for efficiency.
+ if self._state_is_tuple:
+ c, h = state
+ else:
+ c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
+
+ # get context from encoder outputs
+ context = self._simple_attention(self._encoder_vector,
+ self._encoder_proj, h)
+
+ if self._linear is None:
+ self._linear = _Linear([inputs, context, h], 4 * self._num_units,
+ True)
+ # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+ i, j, f, o = array_ops.split(
+ value=self._linear([inputs, context, h]),
+ num_or_size_splits=4,
+ axis=1)
+
+ new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
+ self._activation(j))
+ new_h = self._activation(new_c) * sigmoid(o)
+
+ if self._state_is_tuple:
+ new_state = LSTMStateTuple(new_c, new_h)
+ else:
+ new_state = array_ops.concat([new_c, new_h], 1)
+ return new_h, new_state
+
+ def _simple_attention(self, encoder_vec, encoder_proj, decoder_state):
+ """Implement the attention function.
+        The implementation follows the same logic as the fluid decoder.
+ """
+ decoder_state_proj = tf.contrib.layers.fully_connected(
+ inputs=decoder_state,
+ num_outputs=self._num_units,
+ activation_fn=None,
+ biases_initializer=None)
+ decoder_state_expand = tf.tile(
+ tf.expand_dims(
+ input=decoder_state_proj, axis=1),
+ [1, tf.shape(encoder_proj)[1], 1])
+ concated = tf.concat([decoder_state_expand, encoder_proj], axis=2)
+ # need reduce the first dimension
+ attention_weights = tf.contrib.layers.fully_connected(
+ inputs=tf.reshape(
+ concated, shape=[-1, self._num_units * 2]),
+ num_outputs=1,
+ activation_fn=tf.nn.tanh,
+ biases_initializer=None)
+ attention_weights_reshaped = tf.reshape(
+ attention_weights, shape=[tf.shape(encoder_vec)[0], -1, 1])
+ # normalize the attention weights using softmax
+ attention_weights_normed = tf.nn.softmax(
+ attention_weights_reshaped, dim=1)
+ scaled = tf.multiply(attention_weights_normed, encoder_vec)
+ context = tf.reduce_sum(scaled, axis=1)
+ return context
+
+ def _reset_padding(self,
+ memory,
+ memory_sequence_length,
+ check_inner_dims_defined=True):
+ """Reset the padding part for encoder inputs.
+        This function comes from tensorflow's `_prepare_memory` function.
+ """
+ memory = nest.map_structure(
+ lambda m: ops.convert_to_tensor(m, name="memory"), memory)
+ if memory_sequence_length is not None:
+ memory_sequence_length = ops.convert_to_tensor(
+ memory_sequence_length, name="memory_sequence_length")
+ if check_inner_dims_defined:
+
+ def _check_dims(m):
+ if not m.get_shape()[2:].is_fully_defined():
+ raise ValueError(
+ "Expected memory %s to have fully defined inner dims, "
+ "but saw shape: %s" % (m.name, m.get_shape()))
+
+ nest.map_structure(_check_dims, memory)
+ if memory_sequence_length is None:
+ seq_len_mask = None
+ else:
+ seq_len_mask = array_ops.sequence_mask(
+ memory_sequence_length,
+ maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
+ dtype=nest.flatten(memory)[0].dtype)
+ seq_len_batch_size = (memory_sequence_length.shape[0].value or
+ array_ops.shape(memory_sequence_length)[0])
+
+ def _maybe_mask(m, seq_len_mask):
+ rank = m.get_shape().ndims
+ rank = rank if rank is not None else array_ops.rank(m)
+ extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
+ m_batch_size = m.shape[0].value or array_ops.shape(m)[0]
+ if memory_sequence_length is not None:
+ message = ("memory_sequence_length and memory tensor "
+ "batch sizes do not match.")
+ with ops.control_dependencies([
+ check_ops.assert_equal(
+ seq_len_batch_size, m_batch_size, message=message)
+ ]):
+ seq_len_mask = array_ops.reshape(
+ seq_len_mask,
+ array_ops.concat(
+ (array_ops.shape(seq_len_mask), extra_ones), 0))
+ return m * seq_len_mask
+ else:
+ return m
+
+ return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask),
+ memory)
+
+
+def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
+ target_dict_dim, is_generating, beam_size,
+ max_generation_length):
+ src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
+ src_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
+
+ src_embedding_weights = tf.get_variable("source_word_embeddings",
+ [source_dict_dim, embedding_dim])
+ src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)
+
+ src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
+ src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
+ # no peephole
+ encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
+ cell_fw=src_forward_cell,
+ cell_bw=src_reversed_cell,
+ inputs=src_embedding,
+ sequence_length=src_sequence_length,
+ dtype=tf.float32)
+
+ # concat the forward outputs and backward outputs
+ encoded_vec = tf.concat(encoder_outputs, axis=2)
+
+ # project the encoder outputs to size of decoder lstm
+ encoded_proj = tf.contrib.layers.fully_connected(
+ inputs=tf.reshape(
+            encoded_vec, shape=[-1, encoder_size * 2]),
+ num_outputs=decoder_size,
+ activation_fn=None,
+ biases_initializer=None)
+ encoded_proj_reshape = tf.reshape(
+ encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])
+
+    # get the initial state for the decoder lstm's H from the first step of
+    # the backward encoder output
+    backward_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
+    decoder_boot = tf.contrib.layers.fully_connected(
+        inputs=tf.reshape(
+            backward_first, shape=[-1, encoder_size]),
+        num_outputs=decoder_size,
+        activation_fn=tf.nn.tanh,
+        biases_initializer=None)
+
+ # prepare the initial state for decoder lstm
+ cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
+ initial_state = LSTMStateTuple(cell_init, decoder_boot)
+
+ # create decoder lstm cell
+ decoder_cell = LSTMCellWithSimpleAttention(
+ decoder_size,
+ encoded_vec
+ if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
+ encoded_proj_reshape if not is_generating else
+ seq2seq.tile_batch(encoded_proj_reshape, beam_size),
+ src_sequence_length if not is_generating else
+ seq2seq.tile_batch(src_sequence_length, beam_size),
+ forget_bias=0.0)
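+    # For beam search decoding, seq2seq.tile_batch above repeats every
+    # per-example tensor beam_size times, so all beam_size hypotheses of one
+    # source sentence attend over the same encoder memory.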
+
+ output_layer = Dense(target_dict_dim, name='output_projection')
+
+ if not is_generating:
+ trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
+ trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
+ trg_embedding_weights = tf.get_variable(
+ "target_word_embeddings", [target_dict_dim, embedding_dim])
+ trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
+ trg_word_idx)
+
+ training_helper = seq2seq.TrainingHelper(
+ inputs=trg_embedding,
+ sequence_length=trg_sequence_length,
+ time_major=False,
+ name='training_helper')
+
+ training_decoder = seq2seq.BasicDecoder(
+ cell=decoder_cell,
+ helper=training_helper,
+ initial_state=initial_state,
+ output_layer=output_layer)
+
+ # get the max length of target sequence
+ max_decoder_length = tf.reduce_max(trg_sequence_length)
+
+ decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
+ decoder=training_decoder,
+ output_time_major=False,
+ impute_finished=True,
+ maximum_iterations=max_decoder_length)
+
+ decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
+ decoder_pred_train = tf.argmax(
+ decoder_logits_train, axis=-1, name='decoder_pred_train')
+ masks = tf.sequence_mask(
+ lengths=trg_sequence_length,
+ maxlen=max_decoder_length,
+ dtype=tf.float32,
+ name='masks')
+
+ # place holder of label sequence
+ lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])
+
+ # compute the loss
+ loss = seq2seq.sequence_loss(
+ logits=decoder_logits_train,
+ targets=lbl_word_idx,
+ weights=masks,
+ average_across_timesteps=True,
+ average_across_batch=True)
+
+ # return feeding list and loss operator
+ return {
+ 'src_word_idx': src_word_idx,
+ 'src_sequence_length': src_sequence_length,
+ 'trg_word_idx': trg_word_idx,
+ 'trg_sequence_length': trg_sequence_length,
+ 'lbl_word_idx': lbl_word_idx
+ }, loss
+ else:
+ start_tokens = tf.ones([tf.shape(src_word_idx)[0], ],
+ tf.int32) * START_TOKEN_IDX
+ # share the same embedding weights with target word
+ trg_embedding_weights = tf.get_variable(
+ "target_word_embeddings", [target_dict_dim, embedding_dim])
+
+ inference_decoder = beam_search_decoder.BeamSearchDecoder(
+ cell=decoder_cell,
+ embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens),
+ start_tokens=start_tokens,
+ end_token=END_TOKEN_IDX,
+ initial_state=tf.nn.rnn_cell.LSTMStateTuple(
+ tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
+ tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
+ beam_width=beam_size,
+ output_layer=output_layer)
+
+ decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
+ decoder=inference_decoder,
+ output_time_major=False,
+            # impute_finished=True is omitted here: it raises an error when
+            # combined with BeamSearchDecoder
+ maximum_iterations=max_generation_length)
+
+ predicted_ids = decoder_outputs_decode.predicted_ids
+
+ return {
+ 'src_word_idx': src_word_idx,
+ 'src_sequence_length': src_sequence_length
+ }, predicted_ids
+
+
+def print_arguments(args):
+ print('----------- Configuration Arguments -----------')
+ for arg, value in vars(args).iteritems():
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+def padding_data(data, padding_size, value):
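+    """Pad `data` with `value`, then truncate to exactly `padding_size` items.
+
+    A tiny illustration (not part of the benchmark run):
+        padding_data([3, 1, 4], 5, 0)           -> [3, 1, 4, 0, 0]
+        padding_data([3, 1, 4, 1, 5, 9], 4, 0)  -> [3, 1, 4, 1]
+    """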
+ data = data + [value] * padding_size
+ return data[:padding_size]
+
+
+def save(sess, path, var_list=None, global_step=None):
+ saver = tf.train.Saver(var_list)
+ save_path = saver.save(sess, save_path=path, global_step=global_step)
+    print('Model saved at %s' % save_path)
+
+
+def restore(sess, path, var_list=None):
+ # var_list = None returns the list of all saveable variables
+ saver = tf.train.Saver(var_list)
+ saver.restore(sess, save_path=path)
+    print('Model restored from %s' % path)
+
+
+def adapt_batch_data(data):
+ src_seq = map(lambda x: x[0], data)
+ trg_seq = map(lambda x: x[1], data)
+ lbl_seq = map(lambda x: x[2], data)
+
+ src_sequence_length = np.array(
+ [len(seq) for seq in src_seq]).astype('int32')
+ src_seq_maxlen = np.max(src_sequence_length)
+
+ trg_sequence_length = np.array(
+ [len(seq) for seq in trg_seq]).astype('int32')
+ trg_seq_maxlen = np.max(trg_sequence_length)
+
+ src_seq = np.array(
+ [padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
+ for seq in src_seq]).astype('int32')
+
+ trg_seq = np.array(
+ [padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
+ for seq in trg_seq]).astype('int32')
+
+ lbl_seq = np.array(
+ [padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
+ for seq in lbl_seq]).astype('int32')
+
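+    # Each *_word_idx entry is a [batch, max_len] int32 matrix padded with
+    # the end token; the *_sequence_length vectors let the dynamic RNN and
+    # the loss masks ignore that padding.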
+ return {
+ 'src_word_idx': src_seq,
+ 'src_sequence_length': src_sequence_length,
+ 'trg_word_idx': trg_seq,
+ 'trg_sequence_length': trg_sequence_length,
+ 'lbl_word_idx': lbl_seq
+ }
+
+
+def train():
+ feeding_dict, loss = seq_to_seq_net(
+ embedding_dim=args.embedding_dim,
+ encoder_size=args.encoder_size,
+ decoder_size=args.decoder_size,
+ source_dict_dim=args.dict_size,
+ target_dict_dim=args.dict_size,
+ is_generating=False,
+ beam_size=args.beam_size,
+ max_generation_length=args.max_generation_length)
+
+ global_step = tf.Variable(0, trainable=False, name='global_step')
+ trainable_params = tf.trainable_variables()
+ optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+
+    gradients = tf.gradients(loss, trainable_params)
+    # clip gradients by global norm to keep training stable
+    clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
+
+    updates = optimizer.apply_gradients(
+        zip(clipped_gradients, trainable_params), global_step=global_step)
+
+ src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
+
+ train_batch_generator = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+ batch_size=args.batch_size)
+
+ test_batch_generator = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+ batch_size=args.batch_size)
+
+    def do_validation():
+ total_loss = 0.0
+ count = 0
+ for batch_id, data in enumerate(test_batch_generator()):
+ adapted_batch_data = adapt_batch_data(data)
+ outputs = sess.run([loss],
+ feed_dict={
+ item[1]: adapted_batch_data[item[0]]
+ for item in feeding_dict.items()
+ })
+ total_loss += outputs[0]
+ count += 1
+ return total_loss / count
+
+ config = tf.ConfigProto(
+ intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+ config.gpu_options.allow_growth = True
+
+ with tf.Session(config=config) as sess:
+ init_g = tf.global_variables_initializer()
+ init_l = tf.local_variables_initializer()
+ sess.run(init_l)
+ sess.run(init_g)
+ for pass_id in xrange(args.pass_num):
+ pass_start_time = time.time()
+ words_seen = 0
+ for batch_id, data in enumerate(train_batch_generator()):
+ adapted_batch_data = adapt_batch_data(data)
+ words_seen += np.sum(adapted_batch_data['src_sequence_length'])
+ words_seen += np.sum(adapted_batch_data['trg_sequence_length'])
+ outputs = sess.run([updates, loss],
+ feed_dict={
+ item[1]: adapted_batch_data[item[0]]
+ for item in feeding_dict.items()
+ })
+ print("pass_id=%d, batch_id=%d, train_loss: %f" %
+ (pass_id, batch_id, outputs[1]))
+ pass_end_time = time.time()
+            test_loss = do_validation()
+ time_consumed = pass_end_time - pass_start_time
+ words_per_sec = words_seen / time_consumed
+ print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
+ (pass_id, test_loss, words_per_sec, time_consumed))
+
+
+def infer():
+ feeding_dict, predicted_ids = seq_to_seq_net(
+ embedding_dim=args.embedding_dim,
+ encoder_size=args.encoder_size,
+ decoder_size=args.decoder_size,
+ source_dict_dim=args.dict_size,
+ target_dict_dim=args.dict_size,
+ is_generating=True,
+ beam_size=args.beam_size,
+ max_generation_length=args.max_generation_length)
+
+ src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
+ test_batch_generator = paddle.batch(
+ paddle.reader.shuffle(
+            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+ batch_size=args.batch_size)
+
+ config = tf.ConfigProto(
+ intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+ with tf.Session(config=config) as sess:
+ restore(sess, './checkpoint/tf_seq2seq-1500')
+ for batch_id, data in enumerate(test_batch_generator()):
+ src_seq = map(lambda x: x[0], data)
+
+ source_language_seq = [
+ src_dict[item] for seq in src_seq for item in seq
+ ]
+
+ src_sequence_length = np.array(
+ [len(seq) for seq in src_seq]).astype('int32')
+ src_seq_maxlen = np.max(src_sequence_length)
+ src_seq = np.array([
+ padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
+ for seq in src_seq
+ ]).astype('int32')
+
+ outputs = sess.run([predicted_ids],
+ feed_dict={
+ feeding_dict['src_word_idx']: src_seq,
+ feeding_dict['src_sequence_length']:
+ src_sequence_length
+ })
+
+ print("\nDecoder result comparison: ")
+            source_language_seq = ' '.join(source_language_seq).strip()
+ inference_seq = ''
+ print(" --> source: " + source_language_seq)
+ for item in outputs[0][0]:
+ if item[0] == END_TOKEN_IDX: break
+ inference_seq += ' ' + trg_dict.get(item[0], '')
+ print(" --> inference: " + inference_seq)
+
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ print_arguments(args)
+ if args.infer_only:
+ infer()
+ else:
+ train()
diff --git a/benchmark/tensorflow/mnist.py b/benchmark/tensorflow/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..7140eed6eaff49b5c65f9ccb2e38f113a4cdbdbf
--- /dev/null
+++ b/benchmark/tensorflow/mnist.py
@@ -0,0 +1,180 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import time
+import numpy as np
+
+import tensorflow as tf
+import paddle.v2 as paddle
+
+DTYPE = tf.float32
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("mnist model benchmark.")
+ parser.add_argument(
+ '--batch_size', type=int, default=128, help='The minibatch size.')
+ parser.add_argument(
+ '--iterations', type=int, default=35, help='The number of minibatches.')
+ parser.add_argument(
+ '--pass_num', type=int, default=5, help='The number of passes.')
+ parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help='The device type.')
+ args = parser.parse_args()
+ return args
+
+
+def run_benchmark(args):
+ def weight_variable(dtype, shape):
+ initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
+ return tf.Variable(initial)
+
+ def bias_variable(dtype, shape):
+ initial = tf.constant(0.1, shape=shape, dtype=dtype)
+ return tf.Variable(initial)
+
+ device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
+ with tf.device(device):
+ images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
+ labels = tf.placeholder(tf.int64, shape=(None, ))
+
+ # conv1, relu, pool1
+ conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
+ conv1_bias = bias_variable(DTYPE, [20])
+ conv1 = tf.nn.conv2d(
+ images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
+ relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
+ pool1 = tf.nn.max_pool(
+ relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
+
+ # conv2, relu, pool2
+ conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
+ conv2_bias = bias_variable(DTYPE, [50])
+ conv2 = tf.nn.conv2d(
+ pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
+ relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
+ pool2 = tf.nn.max_pool(
+ relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
+
+ # FC
+ pool_shape = pool2.get_shape().as_list()
+ hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
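+        # for the 28x28 MNIST inputs this is 4 * 4 * 50 = 800 features
+        # (28 -> 24 -> 12 -> 8 -> 4 through the two VALID conv/pool stages)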
+ reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
+ fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
+ fc_bias = bias_variable(DTYPE, [10])
+ logits = tf.matmul(reshape, fc_weights) + fc_bias
+
+ # Get prediction
+ prediction = tf.nn.softmax(logits)
+
+ # Loss
+ one_hot_labels = tf.one_hot(labels, depth=10)
+ cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
+ avg_cost = tf.reduce_mean(cost)
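+        # softmax cross entropy written out by hand; mathematically the same
+        # quantity as tf.losses.softmax_cross_entropy, but the explicit
+        # log(prediction) is less numerically stable for near-zero outputs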
+
+ # Get accuracy
+ correct = tf.equal(tf.argmax(prediction, 1), labels)
+ accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+ # metrics, g_accuracy
+ with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
+ g_accuracy = tf.metrics.accuracy(
+ labels, tf.argmax(
+ prediction, axis=1))
+ vars = tf.contrib.framework.get_variables(
+ scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
+ g_accuracy_reset_op = tf.variables_initializer(vars)
+
+ # Optimizer
+ opt = tf.train.AdamOptimizer(
+ learning_rate=0.001, beta1=0.9, beta2=0.999)
+ train_op = opt.minimize(avg_cost)
+ # train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
+
+ train_reader = paddle.batch(
+ paddle.dataset.mnist.train(), batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.mnist.test(), batch_size=args.batch_size)
+
+ def eval_test():
+ sess.run(g_accuracy_reset_op)
+ for batch_id, data in enumerate(test_reader()):
+ images_data = np.array(
+ map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
+ labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
+
+ loss, acc, g_acc = sess.run(
+ [avg_cost, accuracy, g_accuracy],
+ feed_dict={images: images_data,
+ labels: labels_data})
+ return g_acc[1]
+
+ config = tf.ConfigProto(
+ intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+ config.gpu_options.allow_growth = True
+
+ with tf.Session(config=config) as sess:
+ init_g = tf.global_variables_initializer()
+ init_l = tf.local_variables_initializer()
+ sess.run(init_g)
+ sess.run(init_l)
+ for pass_id in range(args.pass_num):
+ sess.run(g_accuracy_reset_op)
+
+ pass_start = time.time()
+ for batch_id, data in enumerate(train_reader()):
+ images_data = np.array(
+ map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
+ labels_data = np.array(map(lambda x: x[1], data)).astype(
+ "int64")
+
+ start = time.time()
+ _, loss, acc, g_acc = sess.run(
+ [train_op, avg_cost, accuracy, g_accuracy],
+ feed_dict={images: images_data,
+ labels: labels_data})
+ end = time.time()
+
+ print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
+                      (pass_id, batch_id, loss, 1 - acc, end - start))
+
+ pass_end = time.time()
+ test_avg_acc = eval_test()
+
+ print(
+ "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
+ % (pass_id, g_acc[1], test_avg_acc,
+                   pass_end - pass_start))
+
+
+def print_arguments(args):
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ print_arguments(args)
+ run_benchmark(args)
diff --git a/benchmark/tensorflow/resnet.py b/benchmark/tensorflow/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..c432fa8d59571e128b9ff9e3ffa1949b792ef3a4
--- /dev/null
+++ b/benchmark/tensorflow/resnet.py
@@ -0,0 +1,504 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py
+
+Get help: python resnet.py --help
+See performance on flowers: python resnet.py
+Train on cifar10: python resnet.py --data=cifar10 --with_test
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import time
+import numpy as np
+
+import paddle.v2 as paddle
+import tensorflow as tf
+
+DTYPE = tf.float32
+
+
+def parse_args():
+ parser = argparse.ArgumentParser('Convolution model benchmark.')
+ parser.add_argument(
+ '--model',
+ type=str,
+ choices=['resnet'],
+ default='resnet',
+ help='The model architecture.')
+ parser.add_argument(
+ '--batch_size', type=int, default=32, help='The minibatch size.')
+ parser.add_argument(
+ '--use_fake_data',
+ action='store_true',
+        help='If set, use fake data instead of real data.')
+ parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=5,
+        help='The number of initial minibatches to skip, for a more accurate '
+        'performance measurement.')
+ parser.add_argument(
+ '--iterations',
+ type=int,
+ default=105,
+ help='The number of minibatches.')
+ parser.add_argument(
+ '--pass_num', type=int, default=300, help='The number of passes.')
+ parser.add_argument(
+ '--order',
+ type=str,
+ default='NHWC',
+ choices=['NCHW', 'NHWC'],
+        help='The data order. NCHW is only supported when running on GPU.')
+ parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help='The device type.')
+ parser.add_argument(
+ '--data',
+ type=str,
+ default='flowers102',
+ choices=['flowers102', 'cifar10'],
+ help='The kinds of data.')
+ parser.add_argument(
+ '--infer_only', action='store_true', help='If set, run forward only.')
+ parser.add_argument(
+ '--use_cprof', action='store_true', help='If set, use cProfile.')
+ parser.add_argument(
+ '--with_test',
+ action='store_true',
+ help='If set, test the testset during training.')
+ parser.add_argument(
+ '--use_nvprof',
+ action='store_true',
+ help='If set, use nvprof for CUDA.')
+ args = parser.parse_args()
+ return args
+
+
+def print_arguments(args):
+    # adjust dependent arguments before printing: nvprof is only meaningful
+    # on GPU, and with_test effectively lifts the fixed iteration cap
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    vars(args)['iterations'] = vars(args)['pass_num'] * 1000 if vars(args)[
+        'with_test'] else vars(args)['iterations']
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+def fixed_padding(inputs, kernel_size, data_format):
+ """Pads the input along the spatial dimensions independently of input size.
+ Args:
+ inputs: A tensor of size [batch, channels, height_in, width_in] or
+ [batch, height_in, width_in, channels] depending on data_format.
+ kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
+ Should be a positive integer.
+ data_format: The input format ('channels_last' or 'channels_first').
+ Returns:
+ A tensor with the same format as the input with the data either intact
+ (if kernel_size == 1) or padded (if kernel_size > 1).
+ """
+ pad_total = kernel_size - 1
+ pad_beg = pad_total // 2
+ pad_end = pad_total - pad_beg
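+    # e.g. kernel_size=3 gives (pad_beg, pad_end) = (1, 1) and kernel_size=7
+    # gives (3, 3) -- the amounts SAME padding would choose, applied here
+    # explicitly so they do not depend on the input size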
+
+ if data_format == 'channels_first':
+ padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end],
+ [pad_beg, pad_end]])
+ else:
+ padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
+ [pad_beg, pad_end], [0, 0]])
+ return padded_inputs
+
+
+def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
+ """Strided 2-D convolution with explicit padding."""
+ # The padding is consistent and is based only on `kernel_size`, not on the
+ # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
+ # This is consistent with PaddlePaddle.
+ # In addition, the calculation for output size in TensorFlow can refer:
+ # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc
+ if strides > 1:
+ inputs = fixed_padding(inputs, kernel_size, data_format)
+
+ return tf.layers.conv2d(
+ inputs=inputs,
+ filters=filters,
+ kernel_size=kernel_size,
+ strides=strides,
+ padding=('SAME' if strides == 1 else 'VALID'),
+ use_bias=False,
+ kernel_initializer=tf.variance_scaling_initializer(),
+ data_format=data_format)
+
+
+def conv_bn(inputs,
+ filters,
+ kernel_size,
+ strides,
+ is_training,
+ data_format,
+ act=True):
+    # set fused=True in batch_normalization for a significant performance
+    # boost. See
+    # https://www.tensorflow.org/performance/performance_guide#common_fused_ops
+ inputs = conv2d_fixed_padding(
+ inputs=inputs,
+ filters=filters,
+ kernel_size=kernel_size,
+ strides=strides,
+ data_format=data_format)
+ inputs = tf.layers.batch_normalization(
+ inputs=inputs,
+ axis=1 if data_format == 'channels_first' else 3,
+ momentum=0.9,
+ epsilon=1e-05,
+ center=True,
+ scale=True,
+ training=is_training,
+ fused=True)
+ if act:
+ inputs = tf.nn.relu(inputs)
+ return inputs
+
+
+def basicblock(inputs, filters, is_training, projection_shortcut, strides,
+ data_format):
+ shortcut = inputs
+ if projection_shortcut is not None:
+ shortcut = projection_shortcut(inputs)
+ inputs = conv_bn(inputs, filters, 3, strides, is_training, data_format)
+ inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
+ inputs = inputs + shortcut
+ inputs = tf.nn.relu(inputs)
+ return inputs
+
+
+def bottleneck(inputs, filters, is_training, projection_shortcut, strides,
+ data_format):
+ shortcut = inputs
+ if projection_shortcut is not None:
+ shortcut = projection_shortcut(inputs)
+ inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format)
+ inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
+ inputs = conv_bn(
+ inputs, filters * 4, 1, 1, is_training, data_format, act=False)
+ inputs = inputs + shortcut
+ inputs = tf.nn.relu(inputs)
+ return inputs
+
+
+def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name,
+ data_format):
+ # Bottleneck blocks end with 4x the number of filters as they start with
+ filters_out = 4 * filters if block_fn is bottleneck else filters
+
+ def projection_shortcut(inputs):
+ return conv2d_fixed_padding(
+ inputs=inputs,
+ filters=filters_out,
+ kernel_size=1,
+ strides=strides,
+ data_format=data_format)
+
+ # Only the first block per block_layer uses projection_shortcut and strides
+ inputs = block_fn(inputs, filters, is_training, projection_shortcut,
+ strides, data_format)
+
+ for _ in range(1, blocks):
+ inputs = block_fn(inputs, filters, is_training, None, 1, data_format)
+
+ return tf.identity(inputs, name)
+
+
+def resnet_imagenet(depth, class_dim, data_format):
+ """Returns the ResNet model for a given size and number of output classes."""
+
+ def resnet_generator(block_fn,
+ layers,
+ num_classes,
+ data_format='channels_last'):
+ if data_format is None:
+ data_format = ('channels_first'
+ if tf.test.is_built_with_cuda() else 'channels_last')
+
+ def model(inputs, is_training):
+ """Constructs the ResNet model given the inputs."""
+ if data_format == 'channels_first':
+ # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
+ # This provides a large performance boost on GPU. See
+ # https://www.tensorflow.org/performance/performance_guide#data_formats
+ inputs = tf.transpose(inputs, [0, 3, 1, 2])
+
+ inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format)
+ inputs = tf.identity(inputs, 'initial_conv')
+ inputs = tf.layers.max_pooling2d(
+ inputs=inputs,
+ pool_size=3,
+ strides=2,
+ padding='SAME',
+ data_format=data_format)
+ inputs = tf.identity(inputs, 'initial_max_pool')
+ inputs = block_layer(inputs, 64, block_fn, layers[0], 1,
+ is_training, 'block_layer1', data_format)
+ inputs = block_layer(inputs, 128, block_fn, layers[1], 2,
+ is_training, 'block_layer2', data_format)
+ inputs = block_layer(inputs, 256, block_fn, layers[2], 2,
+ is_training, 'block_layer3', data_format)
+ inputs = block_layer(inputs, 512, block_fn, layers[3], 2,
+ is_training, 'block_layer4', data_format)
+ inputs = tf.layers.average_pooling2d(
+ inputs=inputs,
+ pool_size=7,
+ strides=1,
+ padding='VALID',
+ data_format=data_format)
+ inputs = tf.identity(inputs, 'final_avg_pool')
+ inputs = tf.reshape(inputs,
+ [-1, 512 if block_fn is basicblock else 2048])
+ inputs = tf.layers.dense(inputs=inputs, units=num_classes)
+ inputs = tf.identity(inputs, 'final_dense')
+ return inputs
+
+ return model
+
+ model_params = {
+ 18: {
+ 'block': basicblock,
+ 'layers': [2, 2, 2, 2]
+ },
+ 34: {
+ 'block': basicblock,
+ 'layers': [3, 4, 6, 3]
+ },
+ 50: {
+ 'block': bottleneck,
+ 'layers': [3, 4, 6, 3]
+ },
+ 101: {
+ 'block': bottleneck,
+ 'layers': [3, 4, 23, 3]
+ },
+ 152: {
+ 'block': bottleneck,
+ 'layers': [3, 8, 36, 3]
+ },
+ 200: {
+ 'block': bottleneck,
+ 'layers': [3, 24, 36, 3]
+ }
+ }
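+    # depth -> (block type, blocks per stage), following the standard ResNet
+    # configurations from He et al.; e.g. ResNet-50 uses bottleneck blocks in
+    # stages of [3, 4, 6, 3]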
+ if depth not in model_params:
+ raise ValueError('Not a valid depth:', depth)
+ params = model_params[depth]
+ return resnet_generator(params['block'], params['layers'], class_dim,
+ data_format)
+
+
+def resnet_cifar10(depth, num_classes, data_format):
+ if depth % 6 != 2:
+ raise ValueError('depth must be 6n + 2:', depth)
+
+ num_blocks = (depth - 2) // 6
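+    # e.g. the depth-32 network used below has (32 - 2) / 6 = 5 basic blocks
+    # in each of its three stages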
+
+ if data_format is None:
+ data_format = ('channels_first'
+ if tf.test.is_built_with_cuda() else 'channels_last')
+
+ def model(inputs, is_training):
+ inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format)
+ inputs = tf.identity(inputs, 'initial_conv')
+ inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training,
+ 'block_layer1', data_format)
+ inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training,
+ 'block_layer2', data_format)
+ inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training,
+ 'block_layer3', data_format)
+ inputs = tf.layers.average_pooling2d(
+ inputs=inputs,
+ pool_size=8,
+ strides=1,
+ padding='VALID',
+ data_format=data_format)
+ inputs = tf.identity(inputs, 'final_avg_pool')
+ inputs = tf.reshape(inputs, [-1, 64])
+ inputs = tf.layers.dense(inputs=inputs, units=num_classes)
+ inputs = tf.identity(inputs, 'final_dense')
+ return inputs
+
+ return model
+
+
+def run_benchmark(args, data_format='channels_last', device='/cpu:0'):
+ """Our model_fn for ResNet to be used with our Estimator."""
+
+ class_dim = 1000
+ dshape = (None, 224, 224, 3)
+
+ pdshape = (3, 224, 224)
+ if args.data == 'flowers102':
+ class_dim = 102
+ dshape = (None, 224, 224, 3)
+ pdshape = (3, 224, 224)
+ elif args.data == 'cifar10':
+ class_dim = 10
+ dshape = (None, 32, 32, 3)
+ pdshape = (3, 32, 32)
+
+ with tf.device(device):
+ images = tf.placeholder(DTYPE, shape=dshape)
+ labels = tf.placeholder(tf.int64, shape=(None, ))
+ is_training = tf.placeholder('bool')
+ onehot_labels = tf.one_hot(labels, depth=class_dim)
+
+ network = resnet_cifar10(
+ 32, class_dim,
+ data_format) if args.data == 'cifar10' else resnet_imagenet(
+ 50, class_dim, data_format)
+
+ logits = network(inputs=images, is_training=is_training)
+
+ cross_entropy = tf.losses.softmax_cross_entropy(
+ logits=logits, onehot_labels=onehot_labels)
+ avg_cost = tf.reduce_mean(cross_entropy)
+
+ correct = tf.equal(tf.argmax(logits, 1), labels)
+ accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+ lr = 0.1 if args.data == 'cifar10' else 0.01
+ optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
+
+ # Batch norm requires update_ops to be added as a train_op dependency.
+ update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+ with tf.control_dependencies(update_ops):
+ train_op = optimizer.minimize(avg_cost)
+
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10()
+ if args.data == 'cifar10' else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10()
+ if args.data == 'cifar10' else paddle.dataset.flowers.test(),
+ batch_size=100)
+
+ def test():
+ test_accs = []
+ for batch_id, data in enumerate(test_reader()):
+ test_images = np.array(
+ map(lambda x: np.transpose(x[0].reshape(pdshape),
+ axes=[1, 2, 0]), data)).astype("float32")
+ test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
+ test_accs.append(
+ accuracy.eval(feed_dict={
+ images: test_images,
+ labels: test_labels,
+ is_training: False
+ }))
+ print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" %
+ (pass_id, num_samples / train_elapsed, np.mean(test_accs)))
+
+ config = tf.ConfigProto(
+ intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+ config.gpu_options.allow_growth = True
+
+ with tf.Session(config=config) as sess:
+ init_g = tf.global_variables_initializer()
+ init_l = tf.local_variables_initializer()
+ sess.run(init_g)
+ sess.run(init_l)
+
+ if args.use_fake_data:
+ data = train_reader().next()
+ images_data = np.array(
+ map(lambda x: np.transpose(x[0].reshape(pdshape),
+ axes=[1, 2, 0]), data)).astype("float32")
+ labels_data = np.array(map(lambda x: x[1], data)).astype('int64')
+ iters, num_samples, start_time = 0, 0, 0.0
+ for pass_id in range(args.pass_num):
+ if iters == args.iterations:
+ break
+ train_accs = []
+ train_losses = []
+ for batch_id, data in enumerate(train_reader()):
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+ if iters == args.iterations:
+ break
+ if not args.use_fake_data:
+ images_data = np.array(
+ map(lambda x: np.transpose(x[0].reshape(pdshape),
+ axes=[1, 2, 0]), data)).astype("float32")
+ labels_data = np.array(map(lambda x: x[1], data)).astype(
+ 'int64')
+ _, loss, acc = sess.run([train_op, avg_cost, accuracy],
+ feed_dict={
+ images: images_data,
+ labels: labels_data,
+ is_training: True
+ })
+ iters += 1
+ train_accs.append(acc)
+ train_losses.append(loss)
+ num_samples += len(data)
+ print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" %
+ (pass_id, iters, loss, acc))
+
+ train_elapsed = time.time() - start_time
+ print("Pass=%d, Loss=%f, Accuray=%f\n" %
+ (pass_id, np.mean(train_losses), np.mean(train_accs)))
+
+ # evaluation
+ if args.with_test:
+ test()
+
+ if not args.with_test:
+ duration = time.time() - start_time
+ examples_per_sec = num_samples / duration
+ sec_per_batch = duration / (iters - args.skip_batch_num)
+
+ print('Total examples: %d, total time: %.5f' %
+ (num_samples, duration))
+ print('%.5f examples/sec, %.5f sec/batch' %
+ (examples_per_sec, sec_per_batch))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ print_arguments(args)
+ if tf.test.is_built_with_cuda():
+ device = '/device:GPU:0'
+ if args.order == 'NHWC':
+ data_format = 'channels_last'
+ else:
+ data_format = 'channels_first'
+ else:
+ device = '/cpu:0'
+ if args.order == 'NHWC':
+ data_format = 'channels_last'
+ else:
+ raise ValueError('Only support NHWC order in CPU mode')
+
+ run_benchmark(args, data_format, device)
diff --git a/benchmark/tensorflow/stacked_dynamic_lstm.py b/benchmark/tensorflow/stacked_dynamic_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5285033005044d907d0b7e91eb66ee7281c4f27a
--- /dev/null
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
@@ -0,0 +1,220 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import tensorflow as tf
+
+import paddle.v2 as paddle
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("LSTM model benchmark.")
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=32,
+        help='The number of sequences in a batch. (default: %(default)d)')
+ parser.add_argument(
+ '--stacked_num',
+ type=int,
+ default=5,
+ help='Number of lstm layers to stack. (default: %(default)d)')
+ parser.add_argument(
+ '--embedding_dim',
+ type=int,
+ default=512,
+ help='Dimension of embedding table. (default: %(default)d)')
+ parser.add_argument(
+ '--hidden_dim',
+ type=int,
+ default=512,
+ help='Hidden size of lstm unit. (default: %(default)d)')
+ parser.add_argument(
+ '--pass_num',
+ type=int,
+ default=10,
+ help='Epoch number to train. (default: %(default)d)')
+ parser.add_argument(
+ '--learning_rate',
+ type=float,
+ default=0.0002,
+ help='Learning rate used to train. (default: %(default)f)')
+ parser.add_argument(
+ '--infer_only', action='store_true', help='If set, run forward only.')
+ args = parser.parse_args()
+ return args
+
+
+def print_arguments(args):
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+def dynamic_lstm_model(dict_size,
+ embedding_dim,
+ hidden_dim,
+ stacked_num,
+ class_num=2,
+ is_train=True):
+ word_idx = tf.placeholder(tf.int64, shape=[None, None])
+ sequence_length = tf.placeholder(tf.int64, shape=[None, ])
+
+ embedding_weights = tf.get_variable('word_embeddings',
+ [dict_size, embedding_dim])
+ embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
+
+    # build one LSTMCell per layer; reusing a single cell instance across
+    # layers would make them share weights
+    stacked_cell = tf.nn.rnn_cell.MultiRNNCell([
+        tf.nn.rnn_cell.LSTMCell(
+            num_units=hidden_dim, use_peepholes=False)
+        for _ in range(stacked_num)
+    ])
+
+ # final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
+ _, final_state = tf.nn.dynamic_rnn(
+ cell=stacked_cell,
+ inputs=embedding,
+ dtype=tf.float32,
+ sequence_length=sequence_length)
+
+ w = tf.Variable(
+ tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
+ bias = tf.Variable(
+ tf.constant(
+ value=0.0, shape=[class_num], dtype=tf.float32))
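+    # final_state[-1] is the LSTMStateTuple of the topmost layer; its hidden
+    # state h (index 1) serves as the fixed-size representation of the whole
+    # sequence for the classifier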
+ prediction = tf.matmul(final_state[-1][1], w) + bias
+
+ if not is_train:
+ return (word_idx, sequence_length), tf.nn.softmax(prediction)
+
+ label = tf.placeholder(tf.int64, shape=[None, ])
+ loss = tf.nn.softmax_cross_entropy_with_logits(
+        labels=tf.one_hot(label, class_num), logits=prediction)
+ avg_loss = tf.reduce_mean(loss)
+
+ correct_count = tf.equal(tf.argmax(prediction, 1), label)
+ acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
+
+ with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
+ g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
+ vars = tf.contrib.framework.get_variables(
+ scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
+ reset_op = tf.variables_initializer(vars)
+
+ return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
+
+
+def padding_data(data, padding_size, value):
+ data = data + [value] * padding_size
+ return data[:padding_size]
+
+
+def train(args):
+ word_dict = paddle.dataset.imdb.word_dict()
+ dict_size = len(word_dict)
+
+ feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
+ dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
+
+ adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+ train_op = adam_optimizer.minimize(avg_loss)
+
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.imdb.train(word_dict), buf_size=25000),
+ batch_size=args.batch_size)
+
+ test_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.imdb.test(word_dict), buf_size=25000),
+ batch_size=args.batch_size)
+
+ def do_validation(sess):
+ sess.run(reset_op)
+ for batch_id, data in enumerate(test_reader()):
+ word_idx = map(lambda x: x[0], data)
+ sequence_length = np.array(
+ [len(seq) for seq in word_idx]).astype('int64')
+ maxlen = np.max(sequence_length)
+ word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
+ word_idx = np.array(word_idx).astype('int64')
+ label = np.array(map(lambda x: x[1], data)).astype('int64')
+
+            # train_op must not run during validation, otherwise the model
+            # would be updated on the test set
+            loss, fetch_acc, fetch_g_acc = sess.run(
+                [avg_loss, acc, g_acc],
+ feed_dict={
+ feeding_list[0]: word_idx,
+ feeding_list[1]: sequence_length,
+ feeding_list[2]: label
+ })
+
+ return fetch_g_acc[1]
+
+ config = tf.ConfigProto(
+ intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+ config.gpu_options.allow_growth = True
+ with tf.Session(config=config) as sess:
+ init_g = tf.global_variables_initializer()
+ init_l = tf.local_variables_initializer()
+ sess.run(init_l)
+ sess.run(init_g)
+
+ for pass_id in xrange(args.pass_num):
+ # clear accuracy local variable
+ sess.run(reset_op)
+ pass_start_time = time.time()
+ words_seen = 0
+
+ for batch_id, data in enumerate(train_reader()):
+ word_idx = map(lambda x: x[0], data)
+ sequence_length = np.array(
+ [len(seq) for seq in word_idx]).astype('int64')
+ words_seen += np.sum(sequence_length)
+ maxlen = np.max(sequence_length)
+ word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
+ word_idx = np.array(word_idx).astype('int64')
+ label = np.array(map(lambda x: x[1], data)).astype('int64')
+
+ _, loss, fetch_acc, fetch_g_acc = sess.run(
+ [train_op, avg_loss, acc, g_acc],
+ feed_dict={
+ feeding_list[0]: word_idx,
+ feeding_list[1]: sequence_length,
+ feeding_list[2]: label
+ })
+
+ print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
+ % (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
+
+ pass_end_time = time.time()
+ time_consumed = pass_end_time - pass_start_time
+ words_per_sec = words_seen / time_consumed
+ test_acc = do_validation(sess)
+ print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
+ (pass_id, test_acc, words_per_sec, time_consumed))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ print_arguments(args)
+
+ if args.infer_only:
+ pass
+ else:
+ train(args)
diff --git a/benchmark/tensorflow/vgg.py b/benchmark/tensorflow/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..fba5ec71a46b3ac8b2e1244424c39fd5192e5458
--- /dev/null
+++ b/benchmark/tensorflow/vgg.py
@@ -0,0 +1,324 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in TensorFlow"""
+import tensorflow as tf
+import paddle.v2 as paddle
+import numpy as np
+import argparse
+import time
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=5,
+    help='The number of initial minibatches to skip, for a more accurate '
+    'performance measurement.')
+parser.add_argument(
+ '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+ '--learning_rate',
+ type=float,
+ default=1e-3,
+ help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help="The device type.")
+parser.add_argument(
+ '--data_format',
+ type=str,
+ default='NHWC',
+ choices=['NCHW', 'NHWC'],
+ help='The data order, NCHW=[batch, channels, height, width].'
+    'Only NHWC is supported right now.')
+parser.add_argument(
+ '--data_set',
+ type=str,
+ default='cifar10',
+ choices=['cifar10', 'flowers'],
+ help='Optional dataset for benchmark.')
+args = parser.parse_args()
+
+
+class VGG16Model(object):
+ def __init__(self):
+ self.parameters = []
+
+ def batch_norm_relu(self, inputs, is_training):
+ """Performs a batch normalization followed by a ReLU."""
+ # We set fused=True for a significant speed boost. See
+ # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
+ inputs = tf.layers.batch_normalization(
+ inputs=inputs,
+ axis=1 if args.data_format == 'NCHW' else -1,
+ momentum=0.9,
+ epsilon=1e-05,
+ center=True,
+ scale=True,
+ training=is_training,
+ fused=True)
+ inputs = tf.nn.relu(inputs)
+ return inputs
+
+ def conv_bn_layer(self,
+ name,
+ images,
+ kernel_shape,
+ is_training,
+ drop_rate=0.0):
+ with tf.name_scope(name) as scope:
+ kernel = tf.Variable(
+ tf.truncated_normal(
+ kernel_shape, dtype=tf.float32, stddev=1e-1),
+ name='weights')
+ conv = tf.nn.conv2d(
+ images,
+ kernel, [1, 1, 1, 1],
+ data_format=args.data_format,
+ padding='SAME')
+ biases = tf.Variable(
+ tf.constant(
+ 0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
+ trainable=True,
+ name='biases')
+ out = tf.nn.bias_add(conv, biases)
+ out = self.batch_norm_relu(out, is_training)
+ out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
+ return out
+
+ def fc_layer(self, name, inputs, shape):
+ with tf.name_scope(name) as scope:
+ fc_w = tf.Variable(
+ tf.truncated_normal(
+ shape, dtype=tf.float32, stddev=1e-1),
+ name='weights')
+ fc_b = tf.Variable(
+ tf.constant(
+ 0.0, shape=[shape[-1]], dtype=tf.float32),
+ trainable=True,
+ name='biases')
+ out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
+ return out
+
+ def network(self, images, class_dim, is_training):
+ """ VGG16 model structure.
+
+ TODO(kuke): enable this network to support the 'NCHW' data format
+ """
+
+ # conv1
+ conv1_1 = self.conv_bn_layer(
+ 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
+ conv1_2 = self.conv_bn_layer(
+ 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
+ # pool1
+ pool1 = tf.nn.max_pool(
+ conv1_2,
+ ksize=[1, 2, 2, 1],
+ strides=[1, 2, 2, 1],
+ padding='SAME',
+ name='pool1')
+ # conv2
+ conv2_1 = self.conv_bn_layer(
+ 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
+ conv2_2 = self.conv_bn_layer(
+ 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
+ # pool2
+ pool2 = tf.nn.max_pool(
+ conv2_2,
+ ksize=[1, 2, 2, 1],
+ strides=[1, 2, 2, 1],
+ padding='SAME',
+ name='pool2')
+ # conv3
+ conv3_1 = self.conv_bn_layer(
+ 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
+ conv3_2 = self.conv_bn_layer(
+ 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
+ conv3_3 = self.conv_bn_layer(
+ 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
+ # pool3
+ pool3 = tf.nn.max_pool(
+ conv3_3,
+ ksize=[1, 2, 2, 1],
+ strides=[1, 2, 2, 1],
+ padding='SAME',
+ name='pool3')
+ # conv4
+ conv4_1 = self.conv_bn_layer(
+ 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
+ conv4_2 = self.conv_bn_layer(
+ 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+ conv4_3 = self.conv_bn_layer(
+ 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+ # pool4
+ pool4 = tf.nn.max_pool(
+ conv4_3,
+ ksize=[1, 2, 2, 1],
+ strides=[1, 2, 2, 1],
+ padding='SAME',
+ name='pool4')
+ # conv5
+ conv5_1 = self.conv_bn_layer(
+ 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
+ conv5_2 = self.conv_bn_layer(
+ 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+ conv5_3 = self.conv_bn_layer(
+ 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+ # pool5
+ pool5 = tf.nn.max_pool(
+ conv5_3,
+ ksize=[1, 2, 2, 1],
+ strides=[1, 2, 2, 1],
+ padding='SAME',
+            name='pool5')
+ # flatten
+ shape = int(np.prod(pool5.get_shape()[1:]))
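+        # after five 2x2 poolings: 32x32 inputs flatten to 1 * 1 * 512 = 512
+        # features, 224x224 inputs to 7 * 7 * 512 = 25088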
+ pool5_flat = tf.reshape(pool5, [-1, shape])
+ # fc1
+ drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
+ fc1 = self.fc_layer('fc1', drop, [shape, 512])
+ # fc2
+ bn = self.batch_norm_relu(fc1, is_training)
+ drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
+ fc2 = self.fc_layer('fc2', drop, [512, 512])
+
+ fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
+
+ return fc3
+
+
+def run_benchmark():
+ """Run benchmark on cifar10 or flowers."""
+
+ if args.data_set == "cifar10":
+ class_dim = 10
+ raw_shape = (3, 32, 32)
+ dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
+ None, 3, 32, 32)
+ else:
+ class_dim = 102
+ raw_shape = (3, 224, 224)
+ dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
+ None, 3, 224, 224)
+
+ device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
+
+ with tf.device(device):
+ images = tf.placeholder(tf.float32, shape=dat_shape)
+ labels = tf.placeholder(tf.int64, shape=(None, ))
+ is_training = tf.placeholder('bool')
+ onehot_labels = tf.one_hot(labels, depth=class_dim)
+
+ vgg16 = VGG16Model()
+ logits = vgg16.network(images, class_dim, is_training)
+ loss = tf.losses.softmax_cross_entropy(
+ onehot_labels=onehot_labels, logits=logits)
+ avg_loss = tf.reduce_mean(loss)
+
+ correct = tf.equal(tf.argmax(logits, 1), labels)
+ accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+ optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+ update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+ with tf.control_dependencies(update_ops):
+ train_op = optimizer.minimize(avg_loss)
+
+ # data reader
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.test10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+
+ # test
+ def test():
+ test_accs = []
+ for batch_id, data in enumerate(test_reader()):
+ test_images = np.array(
+ map(lambda x: np.transpose(x[0].reshape(raw_shape),
+ axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
+ test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
+ test_accs.append(
+ accuracy.eval(feed_dict={
+ images: test_images,
+ labels: test_labels,
+ is_training: False
+ }))
+ return np.mean(test_accs)
+
+ config = tf.ConfigProto(
+ intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+ config.gpu_options.allow_growth = True
+
+ with tf.Session(config=config) as sess:
+ init_g = tf.global_variables_initializer()
+ init_l = tf.local_variables_initializer()
+ sess.run(init_g)
+ sess.run(init_l)
+ iters, num_samples, start_time = 0, 0, time.time()
+ for pass_id in range(args.num_passes):
+ # train
+ num_samples = 0
+ start_time = time.time()
+ for batch_id, data in enumerate(train_reader()):
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+ if iters == args.iterations:
+ break
+ train_images = np.array(
+ map(lambda x: np.transpose(x[0].reshape(raw_shape),
+ axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
+ train_labels = np.array(map(lambda x: x[1], data)).astype(
+ 'int64')
+ _, loss, acc = sess.run([train_op, avg_loss, accuracy],
+ feed_dict={
+ images: train_images,
+ labels: train_labels,
+ is_training: True
+ })
+ iters += 1
+ num_samples += len(data)
+ print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
+ (pass_id, iters, loss, acc))
+ train_elapsed = time.time() - start_time
+ # test
+ pass_test_acc = test()
+ print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
+ (pass_id, num_samples / train_elapsed, pass_test_acc))
+
+
+def print_arguments():
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+ print_arguments()
+ run_benchmark()
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index df3f0c7f0c31efaa127515bb98e5668b8f9df199..796bcf28a1dfb308ccb7a2f839742c5c2fcf2002 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
+SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 5377a0b046a796cd6f0bb1fb466e1cd0b4b678bf..8f7a3bf8eeaef75c8840f4ea318b484d33249bb7 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -54,5 +54,7 @@ add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
"${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
-include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
+include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappystream to include its own headers.
+include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
+
add_dependencies(snappystream extern_snappystream)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 9a9a20f897e09b823dfb19ff841c3f2aeb3f9fe6..a631ad14b18310598f7eea3a51839d61a9e456ff 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -62,7 +62,8 @@ ExternalProject_Add(
)
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.
ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 20b8506e678af4db6ccb65bef99d28e085a67bf2..c3d73235453c8c9fd2859c3ab142888e8bda2dbe 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -25,7 +25,8 @@ ELSE(WIN32)
SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
ENDIF(WIN32)
-INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
ExternalProject_Add(
extern_zlib
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 3fe750f47efc149bb1af6086841bffd5dd8e85fd..c4c9f77df8d57fe162616d2250bd4dfe5b7754e7 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -244,14 +244,14 @@ function(cc_test TARGET_NAME)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
- target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+ target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif()
- add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+ add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endfunction(cc_test)
@@ -311,8 +311,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
- target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
- add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+ target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+ add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(nv_test)
@@ -387,8 +387,8 @@ function(hip_test TARGET_NAME)
endif()
add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
- target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
- add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+ target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+ add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(hip_test)
@@ -561,9 +561,9 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
- COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
+ COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endfunction()
diff --git a/doc/design/file_manager/README.md b/doc/design/file_manager/README.md
deleted file mode 100644
index 3df10d801e568834729f902aace483d033340e2d..0000000000000000000000000000000000000000
--- a/doc/design/file_manager/README.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# FileManager Design Document
-## Goals
-This document describes the design of a system called FileManager, which makes it easy for users to upload their own training data for distributed training.
-
-Its main features include:
-
-- Provide common command-line commands for managing files and directories
-- Support resumable upload and download of large files
-
-## Terminology
-- PFS: short for `Paddlepaddle cloud File System`, an abstraction of the user's file storage space, as opposed to the local filesystem. We currently build it on CephFS.
-- [CephFS](http://docs.ceph.com/docs/master/cephfs/): a POSIX-compliant file system.
-- Chunk: the logical unit into which a file is partitioned.
-
-## Modules
-### Architecture diagram
-![Architecture](./src/filemanager.png)
-
-### PFSClient
-- Features: see the detailed design at [link](./pfs/pfsclient.md)
-  - Provides commands for users to manage their files
-  - Must be able to run across platforms
-
-- Mutual authentication
-  PFSClient and Ingress authenticate each other with [TLS](#tls), so before using PFSClient a user first registers at `cloud.paddlepaddle.org`, requests user space, and downloads the generated CA (certificate authority), Key, and CRT (CA-signed certificate) to the local machine.
-
-### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/)
-- Features:
-  Provides a layer-7 reverse proxy and load balancing based on sticky sessions.
-
-- Passing through the user identity
-  Ingress passes the PFSClient's identity on to PFSServer; see [link](http://www.integralist.co.uk/posts/clientcertauth.html#3) for how to configure this.
-
-### PFSServer
-PFSServer exposes a RESTful API; it handles file-management requests from the PFSClient side and returns the results to the client.
-
-RESTful API
-
-- /api/v1/files
- - `GET /api/v1/files`: Get metadata of files or directories.
- - `POST /api/v1/files`: Create files or directories.
- - `PATCH /api/v1/files`: Update files or directories.
- - `DELETE /api/v1/files`: Delete files or directories.
-
-- /api/v1/file/chunks
-  - `GET /api/v1/file/chunks`: Get the chunk metadata of a file.
-
-- /api/v1/storage/files
- - `GET /api/v1/storage/files`: Download files or directories.
- - `POST /api/v1/storage/files`: Upload files or directories.
-
-- /api/v1/storage/file/chunks
-  - `GET /api/v1/storage/file/chunks`: Download chunk data.
-  - `POST /api/v1/storage/file/chunks`: Upload chunk data.
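-
-For illustration only, a client-side upload of a single chunk against the storage API above might look like the following sketch (the query-parameter names are assumptions, not part of the API specification):
-
-```python
-import requests
-
-
-def upload_chunk(base_url, pfs_path, offset, data, checksum):
-    # POST one chunk of data; `path`, `offset` and `checksum` are
-    # illustrative parameter names only.
-    return requests.post(
-        base_url + "/api/v1/storage/file/chunks",
-        params={"path": pfs_path, "offset": offset, "checksum": checksum},
-        data=data)
-```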
-
-## File Transfer Optimizations
-
-### Chunked file transfer
-User files can be large, so uploading them to the cloud or downloading them to a local machine may take a long time, and the network may be unstable during the transfer. To deal with this, we introduce the concept of a Chunk: a Chunk consists of the offset within its file, the data, the data length, and a checksum. Uploads and downloads are both implemented as operations on Chunks. Since a Chunk is small (256KB by default), each transfer finishes quickly and is unlikely to fail. After transferring the last Chunk, PFSClient checks whether the MD5 of the destination file matches that of the source file.
-
-A typical Chunk looks like this:
-
-```
-type Chunk struct {
- fileOffset int64
- checksum uint32
- len uint32
- data []byte
-}
-```
-
-### Creating sparse files
-When the destination file does not exist, or its size differs from that of the source file, we can create a sparse file with [Fallocate](https://golang.org/pkg/syscall/#Fallocate) and then write multiple Chunks into it concurrently.
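-
-A minimal sketch of this step (Python here purely for illustration; the design itself calls fallocate(2) from Go):
-
-```python
-import os
-
-
-def make_sparse(path, size):
-    # Extend the destination to the source size without writing data;
-    # unwritten regions read back as zeros, so Chunks can later be
-    # written concurrently at their own offsets.
-    with open(path, "wb") as f:
-        f.truncate(size)
-```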
-
-### Overwriting only the inconsistent parts
-The key to the transfer is that PFSClient compares the Chunk checksums of the source and the destination; only the Chunks whose checksums differ are downloaded or uploaded by PFSClient. The parts that have already been transferred successfully are not transferred again.
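-
-For illustration, assuming a CRC32-style checksum as in the `Chunk` struct above, computing the set of Chunks that still need to be transferred could look like this:
-
-```python
-import zlib
-
-CHUNK_SIZE = 256 * 1024  # the default chunk size mentioned above
-
-
-def chunk_checksums(path):
-    # Map each Chunk's file offset to the CRC32 of its data.
-    sums = {}
-    with open(path, "rb") as f:
-        offset = 0
-        while True:
-            data = f.read(CHUNK_SIZE)
-            if not data:
-                break
-            sums[offset] = zlib.crc32(data) & 0xffffffff
-            offset += len(data)
-    return sums
-
-
-def chunks_to_transfer(src_path, dst_path):
-    # Re-send only the Chunks whose checksums differ or are missing.
-    src_sums = chunk_checksums(src_path)
-    dst_sums = chunk_checksums(dst_path)
-    return [off for off, c in src_sums.items() if dst_sums.get(off) != c]
-```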
-
-## User Workflow
-See [link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md)
-
-## Scaffolding
-We use [swagger](https://github.com/swagger-api/swagger-codegen) to generate the scaffolding of PFSClient and PFSServer, so that we can spend more of our effort on the logic itself.
-
-## References
-- [TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md)
-- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/)
-- [linux man document](https://linux.die.net/man/)
diff --git a/doc/design/file_manager/pfs/pfsclient.md b/doc/design/file_manager/pfs/pfsclient.md
deleted file mode 100644
index 56bc70c54bbc92b78d66e04fb495b1300cf8ebe0..0000000000000000000000000000000000000000
--- a/doc/design/file_manager/pfs/pfsclient.md
+++ /dev/null
@@ -1,129 +0,0 @@
-# PFSClient
-
-## Description
-The `pfs` command is a command-line interface for managing your files on PaddlePaddle Cloud.
-
-## Synopsis
-```
-paddle [options] pfs [parameters]
-```
-
-## Options
-```
---profile (string)
- Use a specific profile from your credential file.
-
---help
-  Display more information about the command
-
---version
- Output version information and exit
-
---debug
- Show detailed debugging log
-
---only-show-errors (boolean)
- Only errors and warnings are displayed. All other output is suppressed.
-```
-
-## Path Arguments
-When using a command, we need to specify path arguments. There are two types of path arguments: `localpath` and `pfspath`.
-
-A `pfspath` begins with `/pfs`, e.g. `/pfs/$DATACENTER/home/$USER/folder`.
-
-[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to configure datacenters.
-
-## Order of Path Arguments
-Commonly, if there are two path arguments, the first is the source and the second is the destination.
-
-## Subcommands
-- rm - remove files or directories
-
-```
-Synopsis:
-  rm [-r] [-v] <PFSPath> ...
-
-Options:
- -r
- Remove directories and their contents recursively
- -v
- Cause rm to be verbose, showing files after they are removed.
-
-Examples:
- paddle pfs rm /pfs/$DATACENTER/home/$USER/file
- paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder
-```
-- mv - move (rename) files
-
-```
-Synopsis:
-  mv [-f | -n] [-v] <LocalPath> <PFSPath>
-  mv [-f | -n] [-v] <LocalPath> ... <PFSPath>
-  mv [-f | -n] [-v] <PFSPath> <LocalPath>
-  mv [-f | -n] [-v] <PFSPath> ... <LocalPath>
-  mv [-f | -n] [-v] <PFSPath> <PFSPath>
-  mv [-f | -n] [-v] <PFSPath> ... <PFSPath>
-
-Options:
- -f
- Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.)
- -n
- Do not overwrite an existing file. (The -n option overrides previous -f options.)
- -v
- Cause mv to be verbose, showing files after they are moved.
-
-Examples:
- paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt
-```
-- cp - copy files or directories
-
-```
-Synopsis:
-  cp [-r] [-f | -n] [-v] [--preserve-links] <LocalPath> <PFSPath>
-  cp [-r] [-f | -n] [-v] [--preserve-links] <LocalPath> ... <PFSPath>
-  cp [-r] [-f | -n] [-v] [--preserve-links] <PFSPath> <LocalPath>
-  cp [-r] [-f | -n] [-v] [--preserve-links] <PFSPath> ... <LocalPath>
-  cp [-r] [-f | -n] [-v] [--preserve-links] <PFSPath> <PFSPath>
-  cp [-r] [-f | -n] [-v] [--preserve-links] <PFSPath> ... <PFSPath>
-
-Options:
- -r
- Copy directories recursively
- -f
- Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.)
- -n
- Do not overwrite an existing file. (The -n option overrides previous -f options.)
- -v
- Cause cp to be verbose, showing files after they are copied.
-  --preserve-links
-      Preserve links when copying them
-
-Examples:
- paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file
- paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file
-```
-- ls - list files
-
-```
-Synopsis:
-  ls [-R] <PFSPath> ...
-
-Options:
- -R
- List directory(ies) recursively
-
-Examples:
- paddle pfs ls /pfs/$DATACENTER/home/$USER/file
- paddle pfs ls /pfs/$DATACENTER/home/$USER/folder
-```
-
-- mkdir - make directory(ies)
-Create intermediate directory(ies) as required.
-
-```
-Synopsis:
-  mkdir <PFSPath> ...
-
-Examples:
- paddle pfs mkdir /pfs/$DATACENTER/home/$USER/folder
-```
diff --git a/doc/design/file_manager/src/filemanager.graffle b/doc/design/file_manager/src/filemanager.graffle
deleted file mode 100644
index 7861a33072bc1908f69d12b37c20491dd8663103..0000000000000000000000000000000000000000
Binary files a/doc/design/file_manager/src/filemanager.graffle and /dev/null differ
diff --git a/doc/design/file_manager/src/filemanager.png b/doc/design/file_manager/src/filemanager.png
deleted file mode 100644
index 8139a19f5722f56d3c211f3ab0d3982f751134b9..0000000000000000000000000000000000000000
Binary files a/doc/design/file_manager/src/filemanager.png and /dev/null differ
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
index 9fe79323ef9377a459d8405cfa74c88c52ce9346..8086507bb4b7e870ad6d6091945ed07a00b5100b 100644
--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_fluid_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_docs gen_proto_py)
+add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_fluid_docs_cn
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_fluid_docs_cn gen_proto_py)
+add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)
add_subdirectory(api)
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
index ca40dfb9644cea69329be0ec231378506c138bc0..48b396f0786adad1ba6cd41f72497f853e54bc38 100644
--- a/doc/fluid/api/CMakeLists.txt
+++ b/doc/fluid/api/CMakeLists.txt
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_fluid_apis
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index ae35d8c53476b34cb18331364267dd7c8b94dd64..22e6fb13d7320986a60bc1ef5530187e0970c767 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -494,6 +494,12 @@ reshape
.. autofunction:: paddle.fluid.layers.reshape
:noindex:
+pad
+---
+
+.. autofunction:: paddle.fluid.layers.pad
+ :noindex:
+
scale
-----
diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
index 2c4edee9fe31d502ea62b9fe5c8757c0a4c5e79f..940d37fb31dcd0c50ea6c4c42b052d7cb23a9c47 100644
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -5,9 +5,11 @@ In a large scale machine learning setup where the size of the training data is h
Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
-Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD is used as the estimator for $\theta$. The averaging is done as follows:
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD is used as the estimator for $\theta$. The averaging is done as follows:
+
+$$\bar{\theta}_t = \frac{1}{t} \sum_{i=1}^{t} \theta_i$$
+
We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md
index ed3f5aab2882c16ca6ac1446b4c4d4d27a373af7..8ded0ad22f4013a521bf3bee260565dc5cf855ae 100644
--- a/doc/fluid/design/concepts/README.md
+++ b/doc/fluid/design/concepts/README.md
@@ -6,11 +6,33 @@ Here are some initial thoughts. Your comments are welcome!
I think we need only the following few CMake functions to make a project description mean and clean:
-| C++ | CUDA C++ | Go |
-|---|---|---|
-| cc_library | nv_library | go_library |
-| cc_binary | nv_binary | go_binary |
-| cc_test | nv_test | go_test |
+
+<table>
+<thead>
+<tr>
+<th>C++</th>
+<th>CUDA C++</th>
+<th>Go</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>cc_library</td>
+<td>nv_library</td>
+<td>go_library</td>
+</tr>
+<tr>
+<td>cc_binary</td>
+<td>nv_binary</td>
+<td>go_binary</td>
+</tr>
+<tr>
+<td>cc_test</td>
+<td>nv_test</td>
+<td>go_test</td>
+</tr>
+</tbody>
+</table>
+
- The `_library` functions generate .a files from source code.
- The `_binary` functions generate executable binary files.
diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md
index 907a2def557fd472ac4d679c73447bd9107d1190..3b626bd89cd83a9428997abccfeeebbbbdbb3d38 100644
--- a/doc/fluid/design/concepts/block.md
+++ b/doc/fluid/design/concepts/block.md
@@ -14,11 +14,29 @@ In programming languages, a block is a pair of curly braces that includes local
Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
-| programming languages | PaddlePaddle |
-|-----------------------|-----------------------|
-| for, while loop | RNN, WhileOp |
-| if, if-else, switch | IfElseOp, SwitchOp |
-| sequential execution | a sequence of layers |
+
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>for, while loop</td>
+<td>RNN, WhileOp</td>
+</tr>
+<tr>
+<td>if, if-else, switch</td>
+<td>IfElseOp, SwitchOp</td>
+</tr>
+<tr>
+<td>sequential execution</td>
+<td>a sequence of layers</td>
+</tr>
+</tbody>
+</table>
+
A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
@@ -26,12 +44,33 @@ A key difference is that a C++ program describes a one pass computation, whereas
The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
-| programming languages | PaddlePaddle |
-|-----------------------|---------------------------------|
-| stack | scope hierarchy |
-| stack frame | scope |
-| push at entering block| push at entering block |
-| pop at leaving block | destroy when minibatch completes|
+
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>stack</td>
+<td>scope hierarchy</td>
+</tr>
+<tr>
+<td>stack frame</td>
+<td>scope</td>
+</tr>
+<tr>
+<td>push at entering block</td>
+<td>push at entering block</td>
+</tr>
+<tr>
+<td>pop at leaving block</td>
+<td>destroy when minibatch completes</td>
+</tr>
+</tbody>
+</table>
+
1. In traditional programs:
diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
index 984b59f4c6971dfb6f46dfe342f2751f392c0e88..30bc488a18a28d349645d9d2502aae6691a69931 100644
--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -86,12 +86,40 @@ def layer.fc(X):
We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in the above illustrative example:
-
-| C++ functions/functors | mul | add | | |
-|------------------------|--------------|--------------|-------------|----------|
-| C++ operator class | mulOp | addOp | FCOp | |
-| Python binding | operator.mul | operator.add | operator.fc | |
-| Python function | | | | layer.fc |
+
+<table>
+<thead>
+<tr>
+<th>C++ functions/functors</th>
+<th>mul</th>
+<th>add</th>
+<th></th>
+<th></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>C++ operator class</td>
+<td>mulOp</td>
+<td>addOp</td>
+<td>FCOp</td>
+<td></td>
+</tr>
+<tr>
+<td>Python binding</td>
+<td>operator.mul</td>
+<td>operator.add</td>
+<td>operator.fc</td>
+<td></td>
+</tr>
+<tr>
+<td>Python function</td>
+<td></td>
+<td></td>
+<td></td>
+<td>layer.fc</td>
+</tr>
+</tbody>
+</table>
+
This is how we differentiate layer and operators in PaddlePaddle:
diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
index 10a8a7867fbf072f585fe3bfb1243e4e6bef4ec8..a88292e7888d0ebc64ee89ca315dfea38a12c71d 100644
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ b/doc/fluid/design/concepts/lod_tensor.md
@@ -2,12 +2,38 @@
Like other deep learning systems, PaddlePaddle supports training models from sequence data. Also, like other systems, PaddlePaddle represents a mini-batch of sequences as a Tensor. What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length, so there is no need for padding zeros.
-| | TensorFlow | PaddlePaddle |
-|-----------------------|------------|--------------|
-| RNN | Support | Support |
-| recursive RNN | Support | Support |
-| padding zeros | Must | No need |
-| blob data type | Tensor | LoDTensor |
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>TensorFlow</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>RNN</td>
+<td>Support</td>
+<td>Support</td>
+</tr>
+<tr>
+<td>recursive RNN</td>
+<td>Support</td>
+<td>Support</td>
+</tr>
+<tr>
+<td>padding zeros</td>
+<td>Must</td>
+<td>No need</td>
+</tr>
+<tr>
+<td>blob data type</td>
+<td>Tensor</td>
+<td>LoDTensor</td>
+</tr>
+</tbody>
+</table>
+
PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators. The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences. This document presents the design of LoD and LoDTensor.
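+
+For intuition, a LoD index can be pictured as offset vectors, one per nesting level (an illustrative sketch, not a concrete API):
+
+```python
+# A mini-batch of 3 sequences with lengths 2, 3 and 1: the offsets
+# [0, 2, 5, 6] mark where each sequence begins and ends.
+lod = [[0, 2, 5, 6]]
+
+# Recursive segmentation: 2 top-level sequences made of 2 and 1
+# sub-sequences, which the second level splits further.
+recursive_lod = [[0, 2, 3], [0, 2, 5, 6]]
+```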
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
index fcba08c07f40177d54a91048cb616198402a9d5d..6750323c0167bf1efbde6ef4fd670e88a5aa502a 100644
--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -10,10 +10,27 @@ PaddlePaddle uses proto message to describe compile time program because :
The computation `Program` consists of nested `Blocks`. Each `Block` consists of data (i.e. `Variable`) and `Operations`. The concepts used to represent them are in the table below.
-| |compile time|runtime|
-|---|---|---|
-|Data|VarDesc(proto)|Variable(cpp)|
-|Operation|OpDesc(proto)|Operator(cpp)|
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>compile time</th>
+<th>runtime</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td>VarDesc(proto)</td>
+<td>Variable(cpp)</td>
+</tr>
+<tr>
+<td>Operation</td>
+<td>OpDesc(proto)</td>
+<td>Operator(cpp)</td>
+</tr>
+</tbody>
+</table>
+
## Definition of VarType
diff --git a/doc/fluid/design/concurrent/channel.md b/doc/fluid/design/concurrent/channel.md
index a00a3325e7b49381f0f82ebbf32b74683f02de5f..df67438bcc741ac521b00ee962fc13c93db21182 100644
--- a/doc/fluid/design/concurrent/channel.md
+++ b/doc/fluid/design/concurrent/channel.md
@@ -2,7 +2,7 @@
## Introduction
-A Channel is a data structure that allows for synchronous interprocess
+A Channel is a data structure that allows for synchronous interprocess
communication via message passing. It is a fundamental component of CSP
(communicating sequential processes), and allows users to pass data
between threads without having to worry about synchronization.
@@ -18,7 +18,7 @@ Creates a new channel that takes in variables of a specific dtype.
- **fluid.make_channel(dtype, capacity=0)**
- **dtype**: The data type of variables being sent/received through channel
- - **capacity**: The capacity of the channel. A capacity of 0 represents
+ - **capacity**: The capacity of the channel. A capacity of 0 represents
an unbuffered channel. Capacity > 0 represents a buffered channel
```
@@ -40,8 +40,8 @@ fluid.channel_close(ch)
### Send data to a channel
-Sends a variable to a channel. Currently, variables of dtype `LoDTensor`,
-`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
+Sends a variable to a channel. Currently, variables of dtype `LoDTensor`,
+`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
`ChannelHolder` are supported.
By default, the data of the Variable is moved from the sender to the receiver,
@@ -52,7 +52,7 @@ however the user can optionally copy the data before performing the send.
- **variable**: The variable to send to the channel
- **is_copy**: If set to True, channel_send will perform a variable assign
to copy the source variable to a new variable to be sent.
-
+
```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=100)
@@ -68,7 +68,7 @@ receiving variable.
- **channel**: The channel to receive the variable from
- **return_variable**: The destination variable used to store the data of the
variable received from the channel
-
+
```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=-1)
@@ -84,9 +84,9 @@ internal queues, locks, and conditional variables.
### QueueMessage
QueueMessage encapsulates the state of the channel send/receive operation to be
-put in the **sendq/recvq**. It contains a condition variable used to lock the
+put in the **sendq/recvq**. It contains a condition variable used to lock the
thread (when there are no available sends/receives). In addition, it contains
-a callback function to notify a thread when the QueueMessage is being
+a callback function to notify a thread when the QueueMessage is being
processed by the channel.
### Queues
@@ -108,21 +108,21 @@ channel_recv operation will put a new QueueMessage on the recvq and block the
current thread under two conditions:
1. The channel is buffered and there is no data on the buff_
2. The channel is unbuffered and does not have a sender
-
+
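+To make the queueing behaviour above concrete, here is a toy Python model of the buffered case only (illustrative; the real implementation is in C++ and also covers the unbuffered rendezvous through sendq/recvq):
+
+```python
+import threading
+from collections import deque
+
+
+class BufferedChannel(object):
+    def __init__(self, capacity):
+        assert capacity > 0
+        self.capacity = capacity
+        self.buff = deque()
+        lock = threading.Lock()
+        self.not_full = threading.Condition(lock)
+        self.not_empty = threading.Condition(lock)
+
+    def send(self, value):
+        with self.not_full:
+            # Plays the role of parking the sender on sendq.
+            while len(self.buff) >= self.capacity:
+                self.not_full.wait()
+            self.buff.append(value)
+            self.not_empty.notify()
+
+    def recv(self):
+        with self.not_empty:
+            # Condition 1 above: block while there is no data on buff_.
+            while not self.buff:
+                self.not_empty.wait()
+            value = self.buff.popleft()
+            self.not_full.notify()
+            return value
+```
+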
### State diagram
#### Channel Send
-
+
-
+
#### Channel Receive
-
+
-
+
## Limitations and Considerations
### Variable Copy
@@ -135,5 +135,5 @@ be sent before it is sent.
Please note that this is achieved by adding an **assign** operator and creating
a temporary variable that is sent in place of the original variable. Please
-note that **assign** operator has limited support for only certain variables
+note that the **assign** operator has limited support for only certain variable
datatypes.
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
index f022e67fd3a048cd7e53c91d9a1fd0506487b665..1859f983e9133674e69ecd506d7683ea926b2b8f 100644
--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -10,12 +10,42 @@ The answer relies on the fact that a `ProgramDesc` is similar to an abstract syn
The following table compares concepts in Fluid and Go
-| Go | Fluid |
-|----|-------|
-|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) |
-| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
-| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
-| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
+
+<table>
+<thead>
+<tr>
+<th>Go</th>
+<th>Fluid</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>user-defined functions</td>
+<td><a href="https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid">layers</a></td>
+</tr>
+<tr>
+<td>control-flow and built-in functions</td>
+<td><a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators">intrinsics/operators</a></td>
+</tr>
+<tr>
+<td>goroutines, channels</td>
+<td><a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h">class ThreadPool</a></td>
+</tr>
+<tr>
+<td>runtime</td>
+<td><a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h">class Executor</a></td>
+</tr>
+</tbody>
+</table>
+
## An Example Concurrent Program
@@ -77,11 +107,11 @@ message ProgramDesc {
read(output = X)
kube_get_workers_addrs(output = L)
Y = tensor_array(len(L))
- parallel_for(input = X, output = Y,
+ parallel_for(input = X, output = Y,
attrs = {L, block_id(1)}) # referring to block 1
]
}
-
+
block[1] = Block {
parent = 0,
vars = [x, y, index],
@@ -102,7 +132,7 @@ func main() { //// block 0
X = fluid.read(...)
L = fluid.k8s.get_worker_addrs()
Y = fluid.tensor_array(len(L))
- fluid.parallel_for(X, L,
+ fluid.parallel_for(X, L,
func(index int) { //// block 1
x = X[index]
fluid.send(L[index], x)
@@ -116,7 +146,7 @@ An explanation of the above program:
- `fluid.k8s` is a package that provides access to Kubernetes API.
- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).
-- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread
diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md
index 10d936860fab7e09241e968a63526c7d86d3e568..66d19f44baf861c7847e81ca83f61024ec877faf 100644
--- a/doc/fluid/design/concurrent/csp.md
+++ b/doc/fluid/design/concurrent/csp.md
@@ -13,14 +13,41 @@ Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously exe
There were many concurrent programming models, implemented in various forms:
-| concurrent programming model | implementation |
-|-----|-----|
-| mutex | types and functions in standard libraries |
-| semaphore | types and functions in standard libraries |
-| communicating sequential processes (CSP) | Go programming language |
-| actor model | Erlang programming language |
-| message passing | MPI |
-| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+
+<table>
+<thead>
+<tr>
+<th>concurrent programming model</th>
+<th>implementation</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>mutex</td>
+<td>types and functions in standard libraries</td>
+</tr>
+<tr>
+<td>semaphore</td>
+<td>types and functions in standard libraries</td>
+</tr>
+<tr>
+<td>communicating sequential processes (CSP)</td>
+<td>Go programming language</td>
+</tr>
+<tr>
+<td>actor model</td>
+<td>Erlang programming language</td>
+</tr>
+<tr>
+<td>message passing</td>
+<td>MPI</td>
+</tr>
+<tr>
+<td>bulk synchronous parallel (BSP)</td>
+<td>Pregel distributed programming framework</td>
+</tr>
+</tbody>
+</table>
+
Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
@@ -118,9 +145,9 @@ There are four types of actions with a channel:
```go
close(ch)
```
-
+
Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
-
+
There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
1. A send to a nil channel blocks forever
diff --git a/doc/fluid/design/concurrent/select_op.md b/doc/fluid/design/concurrent/select_op.md
index 52c226bc94a4e8bfc5588705d7f65328840e91cc..4fcae57cc7932cdaebe549486e7f7cebf0bd038a 100644
--- a/doc/fluid/design/concurrent/select_op.md
+++ b/doc/fluid/design/concurrent/select_op.md
@@ -2,13 +2,13 @@
## Introduction
-In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
-statement lets a goroutine wait on multiple communication operations at the
-same time. The **select** blocks until one of its cases can run, then
-executes the case. If multiple cases are ready to run, then one case is
+In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
+statement lets a goroutine wait on multiple communication operations at the
+same time. The **select** blocks until one of its cases can run, then
+executes the case. If multiple cases are ready to run, then one case is
chosen at random to be executed.
-With the introduction of CSP for Paddle, we mimic this behavior by
+With the introduction of CSP for Paddle, we mimic this behavior by
creating a ***select_op***.
## How to use it
@@ -17,11 +17,11 @@ The **select_op** is available as a c++ operator. However most users
will prefer to use the much simpler Python API.
- **fluid.Select()**: Creates a select operator and adds it to the current
-block within the main program. Also creates a sub block and adds it to the
-main program. This sub block is used to hold all variables and operators
+block within the main program. Also creates a sub block and adds it to the
+main program. This sub block is used to hold all variables and operators
used by the case statements.
-
-Within the select block, users can add cases by
+
+Within the select block, users can add cases by
calling **select.case** or **select.default** method.
- **fluid.Select.case(channel_action, channel, result_variable)**: Represents
@@ -37,13 +37,13 @@ execute.
```
ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-
+
x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
-
+
while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
while_op = While(cond=while_cond)
-
+
with while_op.block():
with fluid.Select() as select:
with select.case(fluid.channel_send, channel, x):
@@ -99,17 +99,17 @@ blocks {
}
}
// Create "select" operator.
- // inputs:
+ // inputs:
// X: All input variables used by operators within the select block
// case_to_execute: Variable filled in by select_op when it determines
// which case to execute.
//
// outputs:
- // Out: All output variables referenced by operators within select block.
- //
+ // Out: All output variables referenced by operators within select block.
+ //
// attrs:
// sub_block: The block id containing the select "cases"
- // cases: Serialized list of all cases in the select op.
+ // cases: Serialized list of all cases in the select op.
// Each case is serialized as: ',,,'
// where type is 0 for default, 1 for send, and 2 for receive.
// No channel and values are needed for default cases.
@@ -150,7 +150,7 @@ into **X**. It will also create a temp variable called **case_to_execute**. Th
filled in by the select_op after it has completed processing the case statements.
If there are no available cases to execute (ie: all cases are blocked on channel operations, and
-there is no default statement), then the select_op will block the current thread. The thread will
+there is no default statement), then the select_op will block the current thread. The thread will
unblock once there is a channel operation affecting one of the case statements, at which point, the
**select_op** will set the **case_to_execute** variable to the index of the case to execute.
@@ -247,17 +247,17 @@ blocks {
```
-Cases are represented by a **conditional_block operator**, whose's condition is set as the output of
-equal(**case_to_execute**, **case_index**). Since each case index is unique in this sub-block,
+Cases are represented by a **conditional_block operator**, whose condition is set as the output of
+equal(**case_to_execute**, **case_index**). Since each case index is unique in this sub-block,
only one case will be executed.
### select_op flow
-
+
-The select algorithm is inspired by golang's select routine. Please refer to
+The select algorithm is inspired by golang's select routine. Please refer to
http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information.
## Backward Pass
diff --git a/doc/fluid/design/dist_train/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md
index a405cb6aaf80b9d2e8a1a9c774ca85cc7e62bbab..229cb47c17d633be6848bb35e58d33ec9b47ec3b 100644
--- a/doc/fluid/design/dist_train/distributed_architecture.md
+++ b/doc/fluid/design/dist_train/distributed_architecture.md
@@ -40,11 +40,11 @@ computation is only specified in Python code which sits outside of PaddlePaddle,
Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
-
+
PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
-
+
The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
@@ -60,7 +60,7 @@ For a detailed explanation, refer to this document -
The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
-
+
The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
@@ -152,7 +152,7 @@ for data in train_reader():
The `JobDesc` object describes the distributed job resource specification to run on
the cluster environment.
-
+
`RemoteExecutor.run` sends the `ProgramDesc` and
[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
@@ -171,7 +171,7 @@ In the future, a more general placement algorithm should be implemented, which m
The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
-
+
### Training Data
diff --git a/doc/fluid/design/dist_train/multi_cpu.md b/doc/fluid/design/dist_train/multi_cpu.md
index a8d8ee0422acc84835170a44eb83f9b5f0c6bb40..38222d083084ebfca3099ce96b47868c42d55101 100644
--- a/doc/fluid/design/dist_train/multi_cpu.md
+++ b/doc/fluid/design/dist_train/multi_cpu.md
@@ -8,11 +8,11 @@ Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph.
## Transpiler
-
+
After conversion:
-
+
## Implement
diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md
index 6ce48dfbfce8b094684b412ebfda7e505ddc30ae..73c85da5e89eee0ac7857a0b808bc64ae673fdad 100644
--- a/doc/fluid/design/dist_train/parameter_server.md
+++ b/doc/fluid/design/dist_train/parameter_server.md
@@ -41,11 +41,11 @@ We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
Below is an example of converting the user defined graph to the
subgraphs for the trainer and the parameter server:
-
+
After converting:
-
+
1. The parameter variable W and its optimizer program are placed on the parameter server.
1. Operators are added to the program.
@@ -69,8 +69,7 @@ In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list o
non-zero gradient data. So when we do parameter optimization both locally and remotely,
we only need to send those non-zero rows to the optimizer operators:
-
-
+
### Benefits
- Model parallelism becomes easier to implement: it is an extension to
diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md
index 6f414e5549b149bc88fb252085ff56dbb06730f8..7b61b050f640814d6949cf6847b431da53d59581 100644
--- a/doc/fluid/design/dynamic_rnn/rnn.md
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i
## RNN Algorithm Implementation
-
+
The above diagram shows an RNN unrolled into a full network.
@@ -22,7 +22,7 @@ There are several important concepts here:
There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
-
+
Figure 2 illustrates the RNN's data flow
@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par
The following figure illustrates feeding text into the lower level, one sentence per step, and feeding the step outputs to the top level. The final top-level output is about the whole text.
-
+
```python
@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st
-
+
diff --git a/doc/fluid/design/modules/batch_norm_op.md b/doc/fluid/design/modules/batch_norm_op.md
index d1392619c42d9206bf4bddcd33ad11b033e6cbdb..e451ffcc73b5de2b911e1c6de54b42a5d1d54c37 100644
--- a/doc/fluid/design/modules/batch_norm_op.md
+++ b/doc/fluid/design/modules/batch_norm_op.md
@@ -2,7 +2,7 @@
## What is batch normalization
-Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and make the data distribution easier for next layer's training.
+Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and makes the data distribution easier for the next layer's training.
The principle of batch normalization can be summarized into a simple function:
@@ -66,7 +66,7 @@ As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attribu
The following graph shows the training computational process of `batch_norm_op`:
-
+
cuDNN provides APIs to finish the whole series of computation, so we can use them in our GPU kernel.
@@ -74,13 +74,13 @@ cudnn provides APIs to finish the whole series of computation, we can use them i
`batch_norm_op` is wrapped as a layer in Python:
-```python
-def batch_norm_layer(net,
+```python
+def batch_norm_layer(net,
input,
- output,
- scale,
- bias,
- use_global_est = False,
+ output,
+ scale,
+ bias,
+ use_global_est = False,
epsilon = 1e-6,
momentum = 0.99):
mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
@@ -119,15 +119,15 @@ for pass_id in range(PASS_NUM):
if pass_id % 100 == 0:
net.infer(test_image) # run inferencing model
# ...
-```
+```
`is_infer` is an attribute. Once an operator is created, its attributes cannot be changed. This suggests that we maintain two `batch_norm_op`s in the model: one whose `is_infer` is `True` (we call it `infer_batch_norm_op`) and one whose `is_infer` is `False` (we call it `train_batch_norm_op`). They share all parameters and variables, but are placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it forks into two branches, one going through `train_batch_norm_op` and the other through `infer_batch_norm_op`:
-
+
-Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate.
+Just like what is shown in the above graph, the net forks before `batch_norm_op` and never merges again. All the operators after `batch_norm_op` are duplicated.
When the net runs in training mode, the end of the left branch will be set as the running target, so the dependency tracking process will ignore the right branch automatically. When the net runs in inference mode, the process is reversed.
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
index 73f6d7b90c7dca0d48109cf3d28d5f7cd56b5c0b..f83ad3b6a4e8b4d82d8fe8d4154a2739a9b9628b 100644
--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@@ -2,12 +2,33 @@
Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
-| Python classes | Protobuf messages |
-| --- | --- |
-| Program | ProgramDesc |
-| Block | BlockDesc |
-| Operator | OpDesc |
-| Variable | VarDesc |
+
+<table>
+<thead>
+<tr>
+<th>Python classes</th>
+<th>Protobuf messages</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Program</td>
+<td>ProgramDesc</td>
+</tr>
+<tr>
+<td>Block</td>
+<td>BlockDesc</td>
+</tr>
+<tr>
+<td>Operator</td>
+<td>OpDesc</td>
+</tr>
+<tr>
+<td>Variable</td>
+<td>VarDesc</td>
+</tr>
+</tbody>
+</table>
+
Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
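+
+A minimal sketch of this layering (names are illustrative, not the exact framework code):
+
+```python
+class Block(object):
+    def __init__(self, program, idx):
+        self.desc = program.desc.block(idx)  # the wrapped BlockDesc proto
+        # Construction-time state kept outside the protobuf message:
+        self.program = program
+        self.vars = {}  # name -> Variable
+        self.ops = []   # Operators in creation order
+```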
diff --git a/doc/fluid/design/modules/regularization.md b/doc/fluid/design/modules/regularization.md
index 21280ac898feb4dd5e5a5d9e88d121e856850f0b..8cd5ff71d193f03e1ac923724b52f28c6057d25d 100644
--- a/doc/fluid/design/modules/regularization.md
+++ b/doc/fluid/design/modules/regularization.md
@@ -6,23 +6,23 @@ A central problem in machine learning is how to design an algorithm that will pe
### Parameter Norm Penalties
Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
+$$\tilde{J}(\theta) = J(\theta) + \alpha \Omega(\theta)$$
The parameter `alpha` is a hyperparameter that weights the contribution of the norm penalty term `omega` relative to the standard objective function `J`.
The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
##### L2 Regularization:
+$$\Omega(\theta) = \frac{1}{2} \|w\|_2^2$$
##### L1 Regularization
+$$\Omega(\theta) = \|w\|_1$$
A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
## Regularization Survey
-A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey).
+A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey).
## Proposal for Regularization in PaddlePaddle
@@ -32,41 +32,35 @@ In the new design, we propose to create new operations for regularization. For n
- L2_regularization_op
- L1_regularization_op
-These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
-The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
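+
+Conceptually, the two ops compute nothing more than the following (plain NumPy for illustration, not the operator API):
+
+```python
+import numpy as np
+
+
+def l2_regularization(W, alpha):
+    # 0.5 * alpha * ||W||_2^2
+    return 0.5 * alpha * np.sum(np.square(W))
+
+
+def l1_regularization(W, alpha):
+    # alpha * ||W||_1
+    return alpha * np.sum(np.abs(W))
+```
+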
### Computation Graph
Below is an example of a really simple feed forward neural network.
-
+
The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
-
+
### Python API implementation for Regularization
-Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
#### Creation of Regularization ops
There are two possibilities for creating the regularization ops:
-1. We create these ops immediately while building the computation graph.
-2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added.
+1. We create these ops immediately while building the computation graph.
+2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added.
-The proposal is to add these ops in a lazy manner just before the backward pass.
+The proposal is to add these ops in a lazy manner just before the backward pass.
#### Storage of Regularization attributes
-Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters.
+Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters.
#### High-level API
In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
-
-
-
-
-
-
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
index 110b7d78bf12ac8328fb3a913e4386e75d63c995..5e147f8263e685a4665b5793f7127178cbc3cfdd 100644
--- a/doc/fluid/design/motivation/fluid.md
+++ b/doc/fluid/design/motivation/fluid.md
@@ -10,11 +10,37 @@ Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution
Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
-| Existed since | model as sequence of layers | model as graph of operators | No model |
-|--|--|--|--|
-| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
-| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
-| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+
+<table>
+<thead>
+<tr>
+<th>Existed since</th>
+<th>model as sequence of layers</th>
+<th>model as graph of operators</th>
+<th>No model</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>2013</td>
+<td>Caffe, Theano, Torch, PaddlePaddle</td>
+<td></td>
+<td></td>
+</tr>
+<tr>
+<td>2015</td>
+<td></td>
+<td>TensorFlow, MxNet, Caffe2, ONNX, n-graph</td>
+<td></td>
+</tr>
+<tr>
+<td>2016</td>
+<td></td>
+<td></td>
+<td>PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid</td>
+</tr>
+</tbody>
+</table>
+
From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
index 7c39fabcc6df76afdb6a77b4cbc2edf0bf3ef780..f199cc892f5e84f0a12abe3b8e5cace9849e7fa8 100644
--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -36,11 +36,37 @@ At compile time, the Python program generates a protobuf message representation
At runtime, the C++ program realizes the graph and runs it.
-| | Representation (protobuf messages) | Realization (C++ class objects) |
-|---|---|---|
-|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
-|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
-|Block|BlockDesc|Block|
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Representation (protobuf messages)</th>
+<th>Realization (C++ class objects)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td><a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107">VarDesc</a></td>
+<td><a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24">Variable</a></td>
+</tr>
+<tr>
+<td>Operation</td>
+<td><a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35">OpDesc</a></td>
+<td><a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64">Operator</a></td>
+</tr>
+<tr>
+<td>Block</td>
+<td>BlockDesc</td>
+<td>Block</td>
+</tr>
+</tbody>
+</table>
+
The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of braces (`{` and `}`).
diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md
index af0c6ef36feba9e0239e7a5f81a8dc9108b2471a..f32a5b7e8a4d820319a666dab4c3129360e2c924 100644
--- a/doc/fluid/design/network/deep_speech_2.md
+++ b/doc/fluid/design/network/deep_speech_2.md
@@ -1,4 +1,4 @@
-# DeepSpeech2 on PaddlePaddle: Design Doc
+# DeepSpeech2 on PaddlePaddle: Design Doc
We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine, on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
@@ -68,11 +68,33 @@ We roughly break down the project into 14 tasks:
Tasks parallelizable within phases:
-Roadmap | Description | Parallelizable Tasks
------------ | :------------------------------------ | :--------------------
-Phase I | Simplified model & components | *Task 1* ~ *Task 8*
-Phase II | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
-Phase III | Documentations | *Task13* ~ *Task14*
+
+<table>
+<thead>
+<tr>
+<th>Roadmap</th>
+<th>Description</th>
+<th>Parallelizable Tasks</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Phase I</td>
+<td>Simplified model &amp; components</td>
+<td><em>Task 1</em> ~ <em>Task 8</em></td>
+</tr>
+<tr>
+<td>Phase II</td>
+<td>Standard model &amp; benchmarking &amp; profiling</td>
+<td><em>Task 9</em> ~ <em>Task 12</em></td>
+</tr>
+<tr>
+<td>Phase III</td>
+<td>Documentations</td>
+<td><em>Task 13</em> ~ <em>Task 14</em></td>
+</tr>
+</tbody>
+</table>
+
Issues for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
@@ -94,7 +116,7 @@ The classical DS2 network contains 15 layers (from bottom to top):
- **One** CTC-loss layer
-
+
Figure 1. Architecture of Deep Speech 2 Network.
@@ -102,37 +124,82 @@ We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar
Key ingredients about the layers:
-- **Data Layers**:
+- **Data Layers**:
- Frame sequences data of audio **spectrogram** (with FFT).
- - Token sequences data of **transcription** text (labels).
+ - Token sequences data of **transcription** text (labels).
  - These two types of sequences do not have the same lengths, thus a CTC-loss layer is required.
-- **2D Convolution Layers**:
+- **2D Convolution Layers**:
- Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
  - With striding for only the first convolution layer.
- No pooling for all convolution layers.
-- **Uni-directional RNNs**
+- **Uni-directional RNNs**
- Uni-directional + row convolution: for low-latency inference.
  - Bi-directional + without row convolution: if we don't care about the inference latency.
- **Row convolution**:
- For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs.
- - Not nessesary if with bi-direcitional RNNs.
+  - Not necessary with bi-directional RNNs.
- "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across.
- **Batch Normalization Layers**:
- Added to all above layers (except for data and loss layer).
- Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
-
-
-Required Components | PaddlePaddle Support | Need to Develop
-:------------------------------------- | :-------------------------------------- | :-----------------------
-Data Layer I (Spectrogram) | Not supported yet. | TBD (Task 3)
-Data Layer II (Transcription) | `paddle.data_type.integer_value_sequence` | -
-2D Convolution Layer | `paddle.layer.image_conv_layer` | -
-DataType Converter (vec2seq) | `paddle.layer.block_expand` | -
-Bi-/Uni-directional RNNs | `paddle.layer.recurrent_group` | -
-Row Convolution Layer | Not supported yet. | TBD (Task 4)
-CTC-loss Layer | `paddle.layer.warp_ctc` | -
-Batch Normalization Layer | `paddle.layer.batch_norm` | -
-CTC-Beam search | Not supported yet. | TBD (Task 6)
+
+<table>
+<thead>
+<tr>
+<th>Required Components</th>
+<th>PaddlePaddle Support</th>
+<th>Need to Develop</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data Layer I (Spectrogram)</td>
+<td>Not supported yet.</td>
+<td>TBD (Task 3)</td>
+</tr>
+<tr>
+<td>Data Layer II (Transcription)</td>
+<td>paddle.data_type.integer_value_sequence</td>
+<td>-</td>
+</tr>
+<tr>
+<td>2D Convolution Layer</td>
+<td>paddle.layer.image_conv_layer</td>
+<td>-</td>
+</tr>
+<tr>
+<td>DataType Converter (vec2seq)</td>
+<td>paddle.layer.block_expand</td>
+<td>-</td>
+</tr>
+<tr>
+<td>Bi-/Uni-directional RNNs</td>
+<td>paddle.layer.recurrent_group</td>
+<td>-</td>
+</tr>
+<tr>
+<td>Row Convolution Layer</td>
+<td>Not supported yet.</td>
+<td>TBD (Task 4)</td>
+</tr>
+<tr>
+<td>CTC-loss Layer</td>
+<td>paddle.layer.warp_ctc</td>
+<td>-</td>
+</tr>
+<tr>
+<td>Batch Normalization Layer</td>
+<td>paddle.layer.batch_norm</td>
+<td>-</td>
+</tr>
+<tr>
+<td>CTC-Beam search</td>
+<td>Not supported yet.</td>
+<td>TBD (Task 6)</td>
+</tr>
+</tbody>
+</table>
+
### Row Convolution
@@ -141,18 +208,18 @@ TODO by Assignees
### Beam Search with CTC and LM
-
+
Figure 2. Algorithm for CTC Beam Search Decoder.
-- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
- - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths;
+- The **Beam Search Decoder** for the DS2 CTC-trained network follows a similar approach to \[[3](#references)\], as shown in Figure 2, with two important modifications for the ambiguous parts:
+ - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, since one prefix may come from different paths;
- 2) the if condition ```if l^+ not in A_prev then``` after the probability computation is dropped, since it is hard to understand and seems unnecessary.
- An **external scorer** is passed into the decoder to evaluate a candidate prefix during decoding, whenever a whitespace character is appended in English decoding or any character is appended in Mandarin decoding.
- Such an external scorer consists of a language model, a word count, or any other custom scorers.
- The **language model** is built in Task 5, and its parameters should be carefully tuned to achieve minimum WER/CER (cf. Task 7).
-- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
-
+- This decoder needs to run with **high efficiency** for the convenience of parameter tuning and real-world speech recognition.
+
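+A minimal Python sketch of the accumulation variant (modification 1) is given below; it is illustrative only, omits the external scorer, and assumes `probs` is a `T x (V+1)` matrix of per-frame symbol probabilities with the blank at index 0 and `vocab` the list of the `V` non-blank symbols:
+
+```python
+from collections import defaultdict
+
+def ctc_prefix_beam_search(probs, vocab, beam_size=10):
+    # each prefix keeps (p_b, p_nb): mass of paths ending in blank / non-blank
+    beams = {(): (1.0, 0.0)}
+    for frame in probs:
+        next_beams = defaultdict(lambda: [0.0, 0.0])
+        for prefix, (p_b, p_nb) in beams.items():
+            # extend with blank: the prefix is unchanged, now ends in a blank
+            next_beams[prefix][0] += (p_b + p_nb) * frame[0]
+            for i, symbol in enumerate(vocab):
+                p = frame[i + 1]
+                if prefix and symbol == prefix[-1]:
+                    # a repeated symbol without a blank collapses onto the prefix
+                    next_beams[prefix][1] += p_nb * p
+                    # only the "ends in blank" mass extends it
+                    next_beams[prefix + (symbol,)][1] += p_b * p
+                else:
+                    # accumulate (not assign): the same extended prefix can be
+                    # reached from different source prefixes
+                    next_beams[prefix + (symbol,)][1] += (p_b + p_nb) * p
+        top = sorted(next_beams.items(), key=lambda kv: -(kv[1][0] + kv[1][1]))
+        beams = {k: tuple(v) for k, v in top[:beam_size]}
+    return max(beams, key=lambda k: sum(beams[k]))
+```
+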
## Future Work
diff --git a/doc/fluid/design/network/sequence_decoder.md b/doc/fluid/design/network/sequence_decoder.md
index c4a9bbeeefca0e05c335dd60233691e8bac33015..f13d30ca9fe09c9525c711436f605bb280e11000 100644
--- a/doc/fluid/design/network/sequence_decoder.md
+++ b/doc/fluid/design/network/sequence_decoder.md
@@ -199,7 +199,7 @@ Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail i
## LoD and shape changes during decoding
-
+
According to the image above, the only phase that changes the LoD is beam search.
diff --git a/doc/fluid/design/others/gan_api.md b/doc/fluid/design/others/gan_api.md
index fb41df8615f73d9fd4c32995eab265833eac1a55..7167470088766985fa5ad31657410309330fd725 100644
--- a/doc/fluid/design/others/gan_api.md
+++ b/doc/fluid/design/others/gan_api.md
@@ -1,24 +1,24 @@
# Design for GAN
-GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas.
+GAN (Generative Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and is widely used in many areas.
It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor, and so forth.
In our GAN design, we wrap it as a user-friendly, easily customized Python API for designing different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
-
+
Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
The operators, layers and functions required/optional to build a GAN demo are summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
-
+
Figure 2. Photo borrowed from the original DC-GAN paper.
-## The Conditional-GAN might be a class.
+## The Conditional-GAN might be a class.
In this design we adopt the popular open-source design of https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains the following data structure:
- DCGAN(object): contains everything required to build a GAN model. It provides the following member functions as its API:
@@ -29,7 +29,7 @@ This design we adopt the popular open source design in https://github.com/carped
Returns a generated image.
- discriminator(image):
-Given an image, decide if it is from a real source or a fake one.
+Given an image, decides whether it is from a real source or a fake one.
Returns a 0/1 binary label.
- build_model(self):
@@ -47,7 +47,7 @@ To be more detailed, we introduce our design of DCGAN as following:
```python
class DCGAN(object):
def __init__(self, y_dim=None):
-
+
# hyper parameters
self.y_dim = y_dim # conditional gan or not
self.batch_size = 100
@@ -82,18 +82,18 @@ class DCGAN(object):
# input z: the random noise
# input y: input data label (optional)
# output G_im: generated fake images
-
+
if self.y_dim: # conditional GAN: concatenate the label with the noise
z = pd.layer.concat(1, [z, y])
-
+
G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0)
G_h0_bn = pd.layer.batch_norm(G_h0)
G_h0_relu = pd.layer.relu(G_h0_bn)
-
+
G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1)
G_h1_bn = pd.layer.batch_norm(G_h1)
G_h1_relu = pd.layer.relu(G_h1_bn)
-
+
G_h2 = pd.layer.deconv(G_h1_relu, self.G_w2, self.G_b2)
G_im = pd.layer.tanh(G_h2)
return G_im
@@ -111,11 +111,11 @@ class DCGAN(object):
D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0)
D_h0_bn = pd.layer.batchnorm(D_h0)
D_h0_relu = pd.layer.lrelu(D_h0_bn)
-
+
D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1)
D_h1_bn = pd.layer.batchnorm(D_h1)
D_h1_relu = pd.layer.lrelu(D_h1_bn)
-
+
D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2)
return D_h2
```
@@ -123,7 +123,7 @@ class DCGAN(object):
### Class member function: Build the model
- Define data readers as placeholders to hold the data;
- Build generator and discriminators;
-- Define two training losses for discriminator and generator, respectively.
+- Define two training losses for discriminator and generator, respectively.
If we have an execution dependency engine to back-trace all tensors, the module that builds our GAN model will look like this:
```python
class DCGAN(object):
@@ -133,7 +133,7 @@ class DCGAN(object):
self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
self.z = pd.data(pd.float32, [None, self.z_size])
-
+
# step 1: generate images by generator, classify real/fake images with discriminator
if self.y_dim: # if conditional GAN, includes label
self.G = self.generator(self.z, self.y)
@@ -147,12 +147,12 @@ class DCGAN(object):
# generate fake images
self.sampled = self.sampler(self.z)
self.D_f = self.discriminator(self.G) # discriminate the generated fake images
-
+
# step 2: define the two losses
self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)))
self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)))
self.d_loss = self.d_loss_real + self.d_loss_fake
-
+
self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_size)))
```
@@ -176,7 +176,7 @@ class DCGAN(object):
self.G = self.generator(self.z)
self.D_g = self.discriminator(self.G, self.y)
self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_size)))
-
+
with pd.default_block().d_block():
if self.y_dim: # if conditional GAN, includes label
self.D_t = self.discriminator(self.images, self.y)
@@ -217,7 +217,7 @@ if __name__ == "__main__":
# load mnist data
data_X, data_y = load_mnist()
-
+
# Two subgraphs required!!!
with pd.block().d_block():
d_optim = pd.train.Adam(lr = .001, beta= .1)
@@ -228,7 +228,7 @@ if __name__ == "__main__":
# executor
sess = pd.executor()
-
+
# training
for epoch in xrange(10000):
for batch_id in range(N / batch_size):
@@ -239,7 +239,7 @@ if __name__ == "__main__":
batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
if batch_id % 2 == 0:
- sess.run(d_step,
+ sess.run(d_step,
feed_dict = {dcgan.images: batch_im,
dcgan.y: batch_label,
dcgan.z: batch_z})
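+
+A minimal sketch of the full alternating schedule implied above, staying within the pseudo-API of this design doc; `g_step` is assumed to come from the `g_block` mirroring `d_step`, and `next_batch` is a hypothetical helper:
+
+```python
+import numpy as np
+
+def next_batch(X, y, batch_id, batch_size):
+    # hypothetical helper: slice one mini-batch out of the dataset
+    s = batch_id * batch_size
+    return X[s:s + batch_size], y[s:s + batch_size]
+
+for epoch in xrange(10000):
+    for batch_id in range(N / batch_size):
+        batch_im, batch_label = next_batch(data_X, data_y, batch_id, batch_size)
+        batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
+        feed = {dcgan.images: batch_im, dcgan.y: batch_label, dcgan.z: batch_z}
+        if batch_id % 2 == 0:
+            sess.run(d_step, feed_dict=feed)   # update the discriminator
+        else:
+            sess.run(g_step, feed_dict=feed)   # update the generator
+```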
diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..e57072d52fd162e92a3482aef33f99ab9394c532
--- /dev/null
+++ b/doc/fluid/dev/api_doc_std_en.md
@@ -0,0 +1,226 @@
+# API Doc Standard
+
+- [API Doc Structure](#api-doc-structure)
+- [Format and Examples](#format-and-examples)
+- [Complete Example](#complete-example)
+
+
+## API Doc Structure
+
+An API doc should contain the following parts (please write them in this order; a minimal skeleton is sketched after the list):
+
+- Python API Definition
+
+ The definition of the API.
+
+- Function Description
+
+ Description of the API's function.
+ The description includes: the meaning, purpose, and operation performed on the input; references and corresponding links (if any); formulas (if necessary) and explanations of the key variables in them.
+
+- Args Description
+
+ Description of the API's parameters.
+ Introduce the parameters one by one, following the order in the API definition.
+ The introduction includes: data type, default value (if any), meaning, etc.
+
+- Returns
+
+ Introduction of the API's return value.
+ Introduce the meaning of the return value, and provide the corresponding format if necessary.
+ If the return value is a tuple containing multiple elements, introduce them one by one in order.
+
+- Raises(if any)
+
+ Exceptions or errors that may occur, and their possible causes. If more than one exception or error is possible, list them in order.
+
+- Note(if any)
+
+ Matters needing attention. If there is more than one, list them in order.
+
+- Examples
+
+ Examples of how to use the API.
+
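+A minimal skeleton that strings these parts together in the required order is sketched below; the `my_op` API is purely hypothetical, for illustration only:
+
+```
+def my_op(x, name=None):
+    """
+    **My Op**
+
+    One-line function description, then the formula and symbol explanations.
+
+    Args:
+        x (Variable): The input tensor of this layer.
+        name (str, default None): The name of this layer.
+
+    Returns:
+        A tensor variable storing the result.
+
+    Raises:
+        ValueError: If the rank of the input is less than 2.
+
+    Note:
+        Matters needing attention, listed in order.
+
+    Examples:
+        .. code-block:: python
+
+            out = my_op(x)
+    """
+```
+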
+
+## Format and Examples
+
+API documentation must follow the reStructuredText format; please refer to [here](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html).
+The format and examples of each part of the API documentation are as follows (take fc as an example):
+
+- Python API Definition
+
+ - Format
+
+ [Python API Definition]
+
+ - Example
+
+ ```
+ fc(input,
+ size,
+ num_flatten_dims=1,
+ param_attr=None,
+ bias_attr=None,
+ act=None,
+ name=None,
+ main_program=None,
+ startup_program=None)
+ ```
+
+- Function Description
+
+ - Format
+
+ This part contains (please write them in order):
+
+ [Function Description]
+
+ [Formula]
+
+ [Symbols' Descriptions if necessary]
+
+ [References if necessary]
+
+ - Example
+
+ [Function Description]
+
+ ```
+ **Fully Connected Layer**
+
+ The fully connected layer can take multiple tensors as its inputs. It
+ creates a variable called weights for each input tensor, which represents
+ a fully connected weight matrix from each input unit to each output unit.
+ The fully connected layer multiplies each input tensor with its corresponding
+ weight to produce an output Tensor. If multiple input tensors are given,
+ the results of multiple multiplications will be summed up. If bias_attr is
+ not None, a bias variable will be created and added to the output. Finally,
+ if activation is not None, it will be applied to the output as well.
+ ```
+
+ [Formula]
+
+ ```
+ This process can be formulated as follows:
+
+ .. math::
+
+ Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+ ```
+
+ [Symbols' Descriptions if necessary]
+
+ ```
+ In the above equation:
+
+ * :math:`N`: Number of the input.
+ * :math:`X_i`: The input tensor.
+ * :math:`W`: The weights created by this layer.
+ * :math:`b`: The bias parameter created by this layer (if needed).
+ * :math:`Act`: The activation function.
+ * :math:`Out`: The output tensor.
+ ```
+
+ [References if necessary]
+
+ Since fc needs no references, we omit them here. In other cases, please provide an explicit reference and link; take layer_norm as an example:
+
+ ```
+ Refer to `Layer Normalization `_ for more details.
+ ```
+
+
+- Args Description
+
+ - Format
+
+ \[Arg's Name\][(Data Type, Default Value)][Description]
+
+ - Example
+
+ Part of the fc parameters are described as follows:
+
+ ```
+ Args:
+ input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+ the input tensor(s) is at least 2.
+ param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+ parameters/weights of this layer.
+ name (str, default None): The name of this layer.
+ ```
+
+- Returns
+
+ - Format
+
+ [Name][Shape]
+
+ - Example
+
+ ```
+ Returns:
+ A tensor variable storing the transformation result.
+ ```
+
+ When the return value is a tuple containing more than one element, please introduce every element in order; take dynamic_lstm as an example:
+
+ ```
+ Returns:
+ A tuple containing:
+ The hidden state of LSTM whose shape is (T X D).
+ The cell state of LSTM whose shape is (T X D).
+ ```
+
+- Raises
+
+ - Format
+
+ [Exception Type][Condition]
+
+ - Example
+
+ ```
+ Raises:
+ ValueError: If the rank of the input is less than 2.
+ ```
+
+- Note
+
+ - Format
+
+ [Note]
+
+ - Example
+
+ There is no Note in fc, so we omit this part. If there is any note, please write it clearly. If there is more than one note, list them in order. Take scaled\_dot\_product\_attention as an example:
+
+ ```
+ Note:
+ 1. When num_heads > 1, three linear projections are learned respectively
+ to map input queries, keys and values into queries', keys' and values'.
+ queries', keys' and values' have the same shapes with queries, keys
+ and values.
+ 2. When num_heads == 1, scaled_dot_product_attention has no learnable
+ parameters.
+ ```
+
+- Examples
+
+ - Format
+
+ \[Python Code Snippet]
+
+ - Example
+
+ ```
+ Examples:
+ .. code-block:: python
+
+ data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+ fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+ ```
+
+## Complete Example
+
+For a complete example of fc, please see [here](src/fc.py).
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index e70bf5dff3849f2ff82315f7eba4a92c93539843..b123b756e2251c38f319e1aefa2cb04fd7a36b03 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -4,10 +4,10 @@
.. toctree::
:maxdepth: 1
- new_op_en.md
- new_op_kernel_en.md
- use_eigen_en.md
+ new_op_cn.md
+ new_op_kernel.md
+ use_eigen_cn.md
name_convention.md
support_new_device.md
- releasing_process.md
+ releasing_process_cn.md
op_markdown_format.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index f0e9afcfcc9edfb9a91f58375cd415ea414f8f82..98988fc22dcedecdbcd67fb3bf761377bf046337 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -5,9 +5,9 @@ Development
:maxdepth: 1
new_op_en.md
- new_op_kernel_en.md
+ new_op_kernel.md
use_eigen_en.md
name_convention.md
support_new_device.md
- releasing_process.md
+ releasing_process_en.md
op_markdown_format.md
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
index 92996585674b46f45549b972b9f295503b1c7f8c..0c3f88d9c31e05bec399c64bf6ade56e62e01f68 100644
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -26,13 +26,32 @@
Depending on whether they contain a kernel, Ops fall into two kinds: Ops with a kernel and Ops without one. The former inherit from `OperatorWithKernel`, the latter from `OperatorBase`. This tutorial mainly describes how to write an Op with a kernel; the contents an Op must include are summarized below:
-
- Content | Where defined
--------------- | :----------------------
-OpProtoMake definition | `.cc` files; a backward Op does not need an OpProtoMake
-Op definition | `.cc` files
-Kernel implementation | Kernels shared between CPU and CUDA go in `.h` files; otherwise the CPU implementation goes in `.cc` files and the CUDA implementation in `.cu` files.
-Registering the Op | Op registration lives in `.cc` files; kernel registration for CPU lives in `.cc` files and for CUDA in `.cu` files
+
+
+
+Content |
+Where defined |
+
+
+
+
+OpProtoMake definition |
+`.cc` files; a backward Op does not need an OpProtoMake |
+
+
+Op definition |
+`.cc` files |
+
+
+Kernel implementation |
+Kernels shared between CPU and CUDA go in `.h` files; otherwise the CPU implementation goes in `.cc` files and the CUDA implementation in `.cu` files. |
+
+
+Registering the Op |
+Op registration lives in `.cc` files; kernel registration for CPU lives in `.cc` files and for CUDA in `.cu` files |
+
+
+
New op implementations are added under the directory [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names ending in `*_op.h` (if any), `*_op.cc`, and `*_op.cu` (if any). **The system automatically builds the op and its corresponding Python extension from the file names.**
diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md
index da8b1bdd1082e439456daf25e9b3a1e8eb534375..a566a09131f86251b70d5435d0a483aa2a705b35 100644
--- a/doc/fluid/dev/new_op_en.md
+++ b/doc/fluid/dev/new_op_en.md
@@ -33,6 +33,33 @@ Op definition | `.cc` files
Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
+
+
+
+Information |
+ Where is it defined |
+
+
+
+
+OpProtoMake definition |
+`.cc` files; a backward Op does not need an OpProtoMake interface. |
+
+
+Op definition |
+ `.cc` files |
+
+
+Kernel implementation |
+ The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu` files. |
+
+
+Registering the Op |
+ Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation. |
+
+
+
+
New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable). **The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
@@ -279,7 +306,7 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass
def test_check_output(self):
self.check_output()
-
+
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
diff --git a/doc/fluid/dev/new_op_kernel_en.md b/doc/fluid/dev/new_op_kernel.md
similarity index 100%
rename from doc/fluid/dev/new_op_kernel_en.md
rename to doc/fluid/dev/new_op_kernel.md
diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process_cn.md
similarity index 58%
rename from doc/fluid/dev/releasing_process.md
rename to doc/fluid/dev/releasing_process_cn.md
index b9787261092f1f27377886152cb1596d9ff54188..4c6728fba7150b0f1e180e57590f18a5b677c70d 100644
--- a/doc/fluid/dev/releasing_process.md
+++ b/doc/fluid/dev/releasing_process_cn.md
@@ -10,19 +10,10 @@ Each new PaddlePaddle release follows this process:
 * Use the Regression Test List as a checklist to test the correctness of this release.
 * If it fails, record all the failing cases, fix all the bugs on this `release/[version]` branch, increase the patch number by one, and go back to step 2.
 * Update the version information in `python/setup.py.in` and set the `istaged` field to `True`.
- * Build the Python wheel package for this version and publish it to pypi.
- * Since pypi.python.org now follows the [strict naming specification PEP 513](https://www.python.org/dev/peps/pep-0513), the platform-related suffix of the wheel package must be renamed before uploading with twine, e.g. from `linux_x86_64` to `manylinux1_x86_64`.
- * The package names on pypi are paddlepaddle and paddlepaddle_gpu. To upload the GPU version, set name: "paddlepaddle_gpu" in build/python/setup.py and rebuild the wheel package: `python setup.py bdist_wheel`.
- * How to upload:
- ```
- cd build/python
- pip install twine
- twine upload dist/[package to upload]
- ```
- * Build the Docker release image for this version and publish it to DockerHub. If this fails, fix the Docker image build issues, increase the patch number by one, and return to step 2.
-1. After step 3, merge the `release/[version]` branch into the master branch and delete the `release/[version]` branch. Tag the merge commit on master with the version number, then merge `master` back into `develop`. Finally delete the `release/[version]` branch.
-1. Collaborate on writing the Release Note
-
+ * Publish the Python wheel package for this version to pypi.
+ * Update the Docker images (see the detailed steps below).
+1. After step 3, merge the `release/[version]` branch into the master branch and tag the merge commit on master with the version number. Then merge `master` back into `develop`.
+1. Collaborate on writing the Release Note.
Note that:
@@ -31,13 +22,18 @@ Each new PaddlePaddle release follows this process:
## Publishing the wheel packages to pypi
-Use [PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+1. Use [PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
to finish the automated binary builds. As shown in the figure below, choose the version to release (usually one CPU version and one GPU version), click the "..." button to the right of "run", and
-a selection box pops up; in the second tab (Changes), choose the branch to release, here 0.11.0, then click the "Run Build" button. After the build finishes,
-the 3 generated binaries, corresponding to the CAPI, `cp27m` and `cp27mu` versions, can be found in the "Artifacts" drop-down box on that page. Then upload them
-with the `twine` tool as described above.
-
-
+a selection box pops up; in the second tab (Changes), choose the branch to release, here 0.11.0, then click the "Run Build" button.
+
+1. After the build finishes, the 3 generated binaries, corresponding to the CAPI, `cp27m` and `cp27mu` versions, can be found in the "Artifacts" drop-down box on that page.
+1. Since pypi.python.org now follows the [strict naming specification PEP 513](https://www.python.org/dev/peps/pep-0513), the platform-related suffix of the wheel package must be renamed before uploading with twine, e.g. from `linux_x86_64` to `manylinux1_x86_64`.
+1. Upload:
+```
+cd build/python
+pip install twine
+twine upload dist/[package to upload]
+```
* Note: the CI environment uses the Docker images from https://github.com/PaddlePaddle/buildtools as the build environment, to support more Linux
distributions. These images can also be used for manual builds, and can be downloaded from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/.
@@ -48,10 +44,20 @@ Each new PaddlePaddle release follows this process:
After the PaddlePaddle CI wheel builds above finish, the Docker images are pushed to DockerHub automatically, so publishing the Docker images only requires tagging the automatically pushed images
with the tag for the version number:
-1. Go to https://hub.docker.com/r/paddlepaddle/paddle/tags/ and check whether the latest tag was updated after the above wheel builds finished.
-1. Run `docker pull paddlepaddle/paddle:[latest tag]`; the latest tag can be latest, latest-gpu, etc.
-1. Run `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
-1. Run `docker push paddlepaddle/paddle:[version]`
+```
+docker pull [image]:latest
+docker tag [image]:latest [image]:[version]
+docker push [image]:[version]
+```
+
+The image tags that need updating include:
+
+* `[version]`: CPU-only version
+* `[version]-openblas`: openblas version
+* `[version]-gpu`: GPU version (CUDA 8.0, cuDNN 5)
+* `[version]-gpu-[cudaver]-[cudnnver]`: images for different CUDA/cuDNN versions
+
+Afterwards, visit https://hub.docker.com/r/paddlepaddle/paddle/tags/ to check whether the release succeeded.
## PaddlePaddle branching conventions
@@ -66,7 +72,7 @@ PaddlePaddle development uses [git-flow](http://nvie.com/posts/a-successful-git-
* Suggestion: in a developer's forked repo, use the `develop` branch to sync with the main repo's `develop` branch.
* Suggestion: in the forked repo, create your own feature branches based on `develop`.
* When a feature branch is finished, submit a `Pull Request` to the main PaddlePaddle repo for code review.
- * During review, developers can keep committing revised code to their feature branch.
+ * During review, developers can keep committing revised code to their feature branch.
* BugFix branches are also maintained in the developer's own fork. Unlike feature branches, a BugFix branch must raise `Pull Request`s to the main repo's `master`, `develop`, and any existing `release/[version]` branches at the same time.
@@ -76,15 +82,118 @@ PaddlePaddle development uses [git-flow](http://nvie.com/posts/a-successful-git-
### All chapters of the PaddlePaddle Book
-Every PaddlePaddle release must first guarantee that all chapters of the PaddlePaddle Book work correctly. Correctness includes verifying models trained with the current `paddle_trainer` and with pure `Python` training.
-
-| | Quick Start | Recognize Digits | Image Classification | Word2Vec | Sentiment Analysis | Semantic Role Labeling | Machine Translation | Personalized Recommendation |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| API.V2 + Docker + GPU | | | | | | | | |
-| API.V2 + Docker + CPU | | | | | | | | |
-| `paddle_trainer` + Docker + GPU | | | | | | | | |
-| `paddle_trainer` + Docker + CPU | | | | | | | | |
-| API.V2 + Ubuntu + GPU | | | | | | | | |
-| API.V2 + Ubuntu + CPU | | | | | | | | |
-| `paddle_trainer` + Ubuntu + GPU | | | | | | | | |
-| `paddle_trainer` + Ubuntu + CPU | | | | | | | | |
+Every PaddlePaddle release must first guarantee that all chapters of the PaddlePaddle Book work correctly. Correctness includes verifying models trained with the current `paddle_trainer` and with pure `Python` training (V2 and Fluid).
+
+
+
+
+ |
+Quick Start |
+Recognize Digits |
+Image Classification |
+Word2Vec |
+Sentiment Analysis |
+Semantic Role Labeling |
+Machine Translation |
+Personalized Recommendation |
+
+
+
+
+
+API.V2 + Docker + GPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+ API.V2 + Docker + CPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+`paddle_trainer` + Docker + GPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+`paddle_trainer` + Docker + CPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+ API.V2 + Ubuntu + GPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+API.V2 + Ubuntu + CPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+ `paddle_trainer` + Ubuntu + GPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+ `paddle_trainer` + Ubuntu + CPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..f989b964d6d1a329bbe31adc7ec10db017acaefa
--- /dev/null
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -0,0 +1,210 @@
+# PaddlePaddle Releasing Process
+
+PaddlePaddle manages its branches using the "git-flow" branching model, and uses [Semantic Versioning](http://semver.org/) for its version number semantics.
+
+Each time we release a new PaddlePaddle version, we should follow the steps below:
+
+1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
+1. Push a new tag on the release branch; the tag name should be like `[version]rc.patch`. The
+ first tag should be `0.10.0rc1`, the second `0.10.0rc2`, and so on.
+1. After that, we should do:
+ * Run all regression tests on the Regression Test List (see PaddlePaddle TeamCity CI) to confirm
+ that this release has no major bugs.
+ * If the regression tests fail, we must fix those bugs and create a new `release/[version]`
+ branch from the previous release branch.
+ * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
+ * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
+ * Update the Docker images (see below instructions for detail).
+1. After the above step, merge the `release/[version]` branch to master and push a tag on the master commit,
+ then merge `master` to `develop`.
+1. Update the Release Note.
+
+***NOTE:***
+
+* Do ***NOT*** merge commits from the develop branch into release branches, so that the release branch contains
+ only the features for the current release and we can test on that version.
+* If we want to fix bugs on release branches, we must merge the fix to master, develop, and the release branch.
+
+## Publish Wheel Packages to pypi
+
+1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+ to build all wheel packages needed to publish. As shown in the following picture, choose a build
+ version, click the "..." button on the right side of the "Run" button, and switch to the second tab in the
+pop-up box, choose the current release branch and click the "Run Build" button. You may repeat this
+ step to start different versions of builds.
+
+1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`.
+1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we
+ upload the package using `twine`, we need to rename the package from `linux_x86_64` to
+ `manylinux1_x86_64`.
+1. Start the upload:
+ ```
+ cd build/python
+ pip install twine
+ twine upload dist/[package to upload]
+ ```
+
+* NOTE: We use a special Docker image to build our releases to support more Linux distributions; you can
+ download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using
+ scripts under `tools/manylinux1`.
+* pypi does not allow overwriting an already uploaded version of a wheel package, even if you delete the
+ old version. You must change the version number before uploading a new one.
+
+## Publish Docker Images
+
+Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:
+
+```
+docker pull [image]:latest
+docker tag [image]:latest [image]:[version]
+docker push [image]:[version]
+```
+
+Tags that need to be updated are:
+* `[version]`: CPU only version image
+* `[version]-openblas`: openblas version image
+* `[version]-gpu`: GPU version (using CUDA 8.0, cuDNN 5)
+* `[version]-gpu-[cudaver]-[cudnnver]`: tags for different CUDA/cuDNN versions
+
+You can then check the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
+
+## Branching Model
+
+We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
+with some modifications:
+
+* The `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
+* The `develop` branch is for development. Each commit on the develop branch has passed CI unit tests, but no
+ regression tests are run.
+* A `release/[version]` branch is used to publish each release. The latest release branches receive
+ bugfixes only for that version, and no feature updates.
+* Developer forks are not required to follow the
+ [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
+ branching model; every fork works like a feature branch.
+ * Advice: use the fork's develop branch to sync up with the main repo's develop branch.
+ * Advice: fork new feature branches from the fork's develop branch to start developing.
+ * Use that branch on the developer's fork to create pull requests and start reviews.
+ * Developers can push new commits to that branch while the pull request is open.
+* Bug fixes also start from the developer's forked repo, and a bugfix branch can be merged to
+ `master`, `develop` and the `release` branches.
+
+## PaddlePaddle Regression Test List
+
+### All Chapters of PaddlePaddle Book
+
+We need to guarantee that all the chapters of the PaddlePaddle Book run correctly, including
+V1 (`paddle_trainer`) training, V2 training, and Fluid training.
+
+
+
+
+ |
+Linear Regression |
+Recognize Digits |
+Image Classification |
+Word2Vec |
+Personalized Recommendation |
+Sentiment Analysis |
+Semantic Role Labeling |
+Machine Translation |
+
+
+
+
+
+API.V2 + Docker + GPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+ API.V2 + Docker + CPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+`paddle_trainer` + Docker + GPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+`paddle_trainer` + Docker + CPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+ API.V2 + Ubuntu + GPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+API.V2 + Ubuntu + CPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+ `paddle_trainer` + Ubuntu + GPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+ `paddle_trainer` + Ubuntu + CPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
diff --git a/doc/fluid/getstarted/concepts/save_model/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md
index e29129fddf775939c9f7a8b49d850d523e6e5a45..1f12ba0497369eacc6a2db7984781b5672f45ea1 100644
--- a/doc/fluid/getstarted/concepts/save_model/model_format.md
+++ b/doc/fluid/getstarted/concepts/save_model/model_format.md
@@ -4,30 +4,70 @@
A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code.
-As a result, In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters.
+As a result, in PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large parameters and efficient serialization/deserialization of parameters.
## Implementation
-The topology is saved as a plain text in a detailed self-contain protobuf file.
+The topology is saved as plain text in a detailed, self-contained protobuf file.
The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task.
-As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is,
+As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description proto, [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte-string header. It contains all the necessary information, such as the `dims` and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores its values in a contiguous memory buffer; for speed we dump the raw memory to disk and save it as the byte-string content. So the binary format of one tensor is:
The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
-|field name | type | description |
-| --- | --- | --- |
-| version | uint32_t | Version of saved file. Always 0 now. |
-| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. |
-| tensor desc | void* | TensorDesc protobuf binary message |
-| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
-| lod_level | uint64_t | Level of LoD |
-| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
-| data of lod[0] | uint64_t* | [Optional] lod[0].data() |
-| ... | ... | ... |
-
+
+
+
+field name |
+type |
+description |
+
+
+
+
+ version |
+ uint32_t |
+ Version of saved file. Always 0 now. |
+
+
+ tensor desc length |
+ uint32_t |
+ TensorDesc(Protobuf message) length in bytes. |
+
+
+tensor desc |
+ void* |
+ TensorDesc protobuf binary message |
+
+
+ tensor data |
+ void* |
+ Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
+
+
+ lod_level |
+ uint64_t |
+ Level of LoD |
+
+
+ length of lod[0] |
+ uint64_t |
+ [Optional] length of lod[0] in bytes. |
+
+
+ data of lod[0] |
+ uint64_t* |
+ [Optional] lod[0].data() |
+
+
+... |
+ ... |
+ ... |
+
+
+
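+A minimal Python sketch of this layout is given below; it is illustrative only (not Paddle's actual saver) and assumes `desc_bytes` already holds the serialized TensorDesc protobuf message and `data` the raw tensor buffer:
+
+```python
+import struct
+
+def serialize_tensor(desc_bytes, data, lod=()):
+    out = struct.pack('<I', 0)                 # version, always 0 for now
+    out += struct.pack('<I', len(desc_bytes))  # tensor desc length in bytes
+    out += desc_bytes                          # TensorDesc protobuf message
+    out += data                                # raw tensor data; its length is
+                                               # implied by dims() and data_type()
+    out += struct.pack('<Q', len(lod))         # lod_level
+    for level in lod:                          # each level: a list of uint64
+        level_bytes = struct.pack('<%dQ' % len(level), *level)
+        out += struct.pack('<Q', len(level_bytes))  # length of lod[i] in bytes
+        out += level_bytes                          # lod[i].data()
+    return out
+```
+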
## Summary
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
index 1b6f767869aaa800c122c8e7a06a1413e48e10e0..b99b90056b0a2e51f2668a6d27d94857bdc09c37 100644
--- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -65,10 +65,10 @@ exit(1)
**Therefore, in a distributed Fluid environment, we need to create two roles: the Parameter Server and the Trainer.**
-### Distributed training
+### Distributed training
Fluid provides a dedicated tool, the [Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler), to convert a single-machine training program into a distributed one. The idea behind the tool is to find the program's optimization operators and gradient parameters, split them into two parts, and connect them with send/recv operators. The optimization operators and gradient parameters can be obtained from the return value of the optimizer's minimize function.
```python
-optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
```
Putting the Distributed Transpiler, the optimization operators, and the gradient parameters together in one piece of code looks like this:
```python
@@ -99,15 +99,51 @@ for pass_id in range(100):
### Notes on running the distributed training script
Running a distributed job requires assigning the parameters described in the table below:
-| Parameter | Type | Description | Example |
-|:-------------|:------|:---------------------------------------|:-------------|
-| trainer_id | int | ID of the current training node; IDs range from 0 to n-1, where n is the value of trainers | 0/1/2/3 |
-| pservers | str | list of parameter servers | 127.0.0.1:6710,127.0.0.1:6711 |
-| trainers | int | total number of training nodes, a number > 0 | 4 |
-| server_endpoint | str | IP:PORT of the service node being started | 127.0.0.1:8789 |
-| training_role | str | node role, TRAINER/PSERVER | PSERVER |
-
-**Note:** ```training_role``` distinguishes the role of the service being started and is used inside the training program; users may define it as needed. The other parameters are required by the transpile function of fluid.DistributeTranspiler and must be defined before calling it. For example:
+
+
+
+Parameter |
+Type |
+Description |
+Example |
+
+
+
+
+trainer_id |
+int |
+ID of the current training node; IDs range from 0 to n-1, where n is the value of trainers |
+0/1/2/3 |
+
+
+pservers |
+str |
+list of parameter servers |
+127.0.0.1:6710,127.0.0.1:6711 |
+
+
+trainers |
+int |
+total number of training nodes, a number > 0 |
+4 |
+
+
+server_endpoint |
+str |
+IP:PORT of the service node being started |
+127.0.0.1:8789 |
+
+
+training_role |
+str |
+node role, TRAINER/PSERVER |
+PSERVER |
+
+
+
+
+
+**Note:** ```training_role``` distinguishes the role of the service being started and is used inside the training program; users may define it as needed. The other parameters are required by the transpile function of fluid.DistributeTranspiler and must be defined before calling it. For example:
```python
t = fluid.DistributeTranspiler()
diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md
index 17f895573a65731db34b2addddaa22e7f32157ec..8266dec3c6125a09b90ac0ccd4aa5464f5c7db31 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
@@ -42,14 +42,40 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
The meaning of each column is:
-| Column | Meaning |
-| --- | --- |
-| ncalls | number of calls to the function |
-| tottime | total time actually spent in the function, excluding time spent in functions it calls |
-| percall | tottime averaged per call |
-| cumtime | total time of the function, including time spent in functions it calls |
-| percall | cumtime averaged per call |
-| filename:lineno(function) | file name, line number, function name |
+
+
+
+Column |
+Meaning |
+
+
+
+
+ncalls |
+number of calls to the function |
+
+
+tottime |
+total time actually spent in the function, excluding time spent in functions it calls |
+
+
+percall |
+tottime averaged per call |
+
+
+cumtime |
+total time of the function, including time spent in functions it calls |
+
+
+percall |
+cumtime averaged per call |
+
+
+filename:lineno(function) |
+file name, line number, function name |
+
+
+
### Finding performance bottlenecks
diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md
index abe4493c175fb4ee57f1acf45931e2890620d9c1..e95556dd608b7ff0a3eb18873df0015a2da94e7c 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -57,14 +57,40 @@ port, we will see the output like the following:
where each line corresponds to a Python function, and the meaning of
each column is as follows:
-| column | meaning |
-| --- | --- |
-| ncalls | the number of calls into a function |
-| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
-| percall | tottime divided by ncalls |
-| cumtime | the total execution time of the function, including the execution time of other functions being called |
-| percall | cumtime divided by ncalls |
-| filename:lineno(function) | where the function is defined |
+
+
+
+column |
+meaning |
+
+
+
+
+ ncalls |
+ the number of calls into a function |
+
+
+tottime |
+ the total execution time of the function, not including the execution time of other functions called by the function |
+
+
+ percall |
+ tottime divided by ncalls |
+
+
+ cumtime |
+ the total execution time of the function, including the execution time of other functions being called |
+
+
+ percall |
+ cumtime divided by ncalls |
+
+
+ filename:lineno(function) |
+ where the function is defined |
+
+
+
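+A minimal sketch of producing and inspecting these columns with Python's
+built-in profiler is shown below; the `main()` entry point is hypothetical:
+
+```python
+import cProfile
+import pstats
+
+cProfile.run('main()', 'profile.out')        # write raw stats to profile.out
+stats = pstats.Stats('profile.out')
+stats.strip_dirs().sort_stats('cumulative')  # sort rows by cumtime
+stats.print_stats(10)                        # print the 10 most expensive rows
+```
+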
### Identify Performance Bottlenecks
diff --git a/doc/fluid/howto/performance/profiler.md b/doc/fluid/howto/performance/profiler.md
index b20b5efdc1f1f10ce7cec835adcc6fb374ed4e20..ee96e7c74ce317caddb387cbb1d4998937bd5c81 100644
--- a/doc/fluid/howto/performance/profiler.md
+++ b/doc/fluid/howto/performance/profiler.md
@@ -23,7 +23,7 @@ But how to record the time for the mixed C++ and CUDA program? There many C++ A
The overall flow is shown as the following figure.
-
+
### Event
@@ -36,10 +36,10 @@ enum EventKind {
kPopRange};
```
- kMark: a marker only, without a time range.
-- kPushRange: mark the starting event for time range.
+- kPushRange: marks the starting event of a time range.
- kPopRange: marks the ending event of a time range.
-For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, an event lists are used to record each piece.
+For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For each piece of code, an event list is used to record it.
```c++
class Event {
@@ -66,11 +66,11 @@ struct EventList {
};
```
-As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler.
+As mentioned above, there is no need to record the timeline when the profiler is disabled, so a global state is used to enable or disable the profiler.
```c++
enum ProfilerState {
- kDisabled,
+ kDisabled,
kCPU,
kCUDA
};
diff --git a/doc/fluid/images/2_level_rnn.dot b/doc/fluid/images/2_level_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..5d77865061ca7bbbfcf254dd938f09aef5553505
--- /dev/null
+++ b/doc/fluid/images/2_level_rnn.dot
@@ -0,0 +1,56 @@
+digraph G {
+
+ rnn [label="1st level RNN" shape=box]
+
+ subgraph cluster0 {
+ label = "time step 0"
+
+ sent0 [label="sentence"]
+ sent1 [label="sentence"]
+
+ rnn1 [label="2nd level RNN" shape=box]
+
+ sent0 -> rnn1
+ sent1 -> rnn1
+ }
+
+ subgraph cluster1 {
+ label = "time step 1"
+
+ sent2 [label="sentence"]
+ sent3 [label="sentence"]
+
+ rnn2 [label="2nd level RNN" shape=box]
+
+ sent2 -> rnn2
+ sent3 -> rnn2
+ }
+
+ subgraph cluster2 {
+ label = "time step 2"
+
+ sent4 [label="sentence"]
+ sent5 [label="sentence"]
+
+ rnn3 [label="2nd level RNN" shape=box]
+
+ sent4 -> rnn3
+ sent5 -> rnn3
+ }
+
+
+ para0 [label="paragraph info 0"]
+ para1 [label="paragraph info 1"]
+ para2 [label="paragraph info 2"]
+
+ rnn1 -> para0
+ rnn2 -> para1
+ rnn3 -> para2
+
+ para0 -> rnn
+ para1 -> rnn
+ para2 -> rnn
+
+ chapter [label="chapter info"]
+ rnn -> chapter
+}
diff --git a/doc/fluid/images/2_level_rnn.png b/doc/fluid/images/2_level_rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..0537a75beb175c0c284717421f7aa908da2a5038
Binary files /dev/null and b/doc/fluid/images/2_level_rnn.png differ
diff --git a/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455
Binary files /dev/null and b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/fluid/images/asgd.gif b/doc/fluid/images/asgd.gif
new file mode 100644
index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e
Binary files /dev/null and b/doc/fluid/images/asgd.gif differ
diff --git a/doc/fluid/images/batch_norm_fork.dot b/doc/fluid/images/batch_norm_fork.dot
new file mode 100644
index 0000000000000000000000000000000000000000..4bc47713cba2cb23f1b34fffe6426ef10ac3a9df
--- /dev/null
+++ b/doc/fluid/images/batch_norm_fork.dot
@@ -0,0 +1,25 @@
+digraph ImageBatchNormForkGraph {
+ subgraph cluster_before {
+ Prev [label="...", shape=plaintext];
+ Rnn [label="rnn_op", shape=box];
+ BatchNorm [label="batch_norm_op", shape=box];
+ Fc [label="fc_op", shape=box];
+ After [label="...", shape=plaintext];
+ Prev -> Rnn -> BatchNorm -> Fc -> After;
+ label="original";
+ }
+
+ subgraph cluster_after {
+ Prev2 [label="...", shape=plaintext];
+ Rnn2 [label="rnn_op", shape=box];
+ BatchNorm2_1 [label="train_batch_norm_op", shape=box];
+ BatchNorm2_2 [label="infer_batch_norm_op", shape=box];
+ Fc2_1 [label="fc_op", shape=box];
+ Fc2_2 [label="fc_op", shape=box];
+ After2_1 [label="...", shape=plaintext];
+ After2_2 [label="...", shape=plaintext];
+ Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1;
+ Rnn2 -> BatchNorm2_2 -> Fc2_2 -> After2_2
+ label="forked";
+ }
+}
diff --git a/doc/fluid/images/batch_norm_fork.png b/doc/fluid/images/batch_norm_fork.png
new file mode 100644
index 0000000000000000000000000000000000000000..aded62bce5bc268b7a3ef4dc96c89fe21d6ea955
Binary files /dev/null and b/doc/fluid/images/batch_norm_fork.png differ
diff --git a/doc/fluid/images/batch_norm_op_kernel.png b/doc/fluid/images/batch_norm_op_kernel.png
new file mode 100644
index 0000000000000000000000000000000000000000..a99ce81ff3bf42880ebbd6a1297de3bf038e09b2
Binary files /dev/null and b/doc/fluid/images/batch_norm_op_kernel.png differ
diff --git a/doc/fluid/images/beam_search.png b/doc/fluid/images/beam_search.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae
Binary files /dev/null and b/doc/fluid/images/beam_search.png differ
diff --git a/doc/fluid/images/ci_build_whl.png b/doc/fluid/images/ci_build_whl.png
new file mode 100644
index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40
Binary files /dev/null and b/doc/fluid/images/ci_build_whl.png differ
diff --git a/doc/fluid/images/compiler.graffle b/doc/fluid/images/compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..8cc678fea3c820103e7ce81f7a5d625d6c1d92de
Binary files /dev/null and b/doc/fluid/images/compiler.graffle differ
diff --git a/doc/fluid/images/compiler.png b/doc/fluid/images/compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..65d34f841afce9756def07dd8ecb9ca44e658bfe
Binary files /dev/null and b/doc/fluid/images/compiler.png differ
diff --git a/doc/fluid/images/control_flow_graph.png b/doc/fluid/images/control_flow_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b
Binary files /dev/null and b/doc/fluid/images/control_flow_graph.png differ
diff --git a/doc/fluid/images/dataflow_equations.png b/doc/fluid/images/dataflow_equations.png
new file mode 100644
index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658
Binary files /dev/null and b/doc/fluid/images/dataflow_equations.png differ
diff --git a/doc/fluid/images/dcgan.png b/doc/fluid/images/dcgan.png
new file mode 100644
index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28
Binary files /dev/null and b/doc/fluid/images/dcgan.png differ
diff --git a/doc/fluid/images/deep_learning.png b/doc/fluid/images/deep_learning.png
new file mode 100644
index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff
Binary files /dev/null and b/doc/fluid/images/deep_learning.png differ
diff --git a/doc/fluid/images/dist-graph.graffle b/doc/fluid/images/dist-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..941399c6ced8d5f65b6c595522b770c88259df4b
Binary files /dev/null and b/doc/fluid/images/dist-graph.graffle differ
diff --git a/doc/fluid/images/dist-graph.png b/doc/fluid/images/dist-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3546b09f1c2ee3e4f60f519d5e47f823f08051a7
Binary files /dev/null and b/doc/fluid/images/dist-graph.png differ
diff --git a/doc/fluid/images/distributed_architecture.graffle b/doc/fluid/images/distributed_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d1b60141342232e06227c2d430ebc60ec349a907
Binary files /dev/null and b/doc/fluid/images/distributed_architecture.graffle differ
diff --git a/doc/fluid/images/distributed_architecture.png b/doc/fluid/images/distributed_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356
Binary files /dev/null and b/doc/fluid/images/distributed_architecture.png differ
diff --git a/doc/fluid/images/ds2_network.png b/doc/fluid/images/ds2_network.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11
Binary files /dev/null and b/doc/fluid/images/ds2_network.png differ
diff --git a/doc/fluid/images/feed_forward.png b/doc/fluid/images/feed_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b
Binary files /dev/null and b/doc/fluid/images/feed_forward.png differ
diff --git a/doc/fluid/images/feed_forward_regularized.png b/doc/fluid/images/feed_forward_regularized.png
new file mode 100644
index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447
Binary files /dev/null and b/doc/fluid/images/feed_forward_regularized.png differ
diff --git a/doc/fluid/images/fluid-compiler.graffle b/doc/fluid/images/fluid-compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c933df2cb855462c52b2d25f7f9a99b95652961d
Binary files /dev/null and b/doc/fluid/images/fluid-compiler.graffle differ
diff --git a/doc/fluid/images/fluid-compiler.png b/doc/fluid/images/fluid-compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b0ffed2039c91a3a00bbb719da08c91c3acf7bb
Binary files /dev/null and b/doc/fluid/images/fluid-compiler.png differ
diff --git a/doc/fluid/images/graph_construction_example.bash b/doc/fluid/images/graph_construction_example.bash
new file mode 100755
index 0000000000000000000000000000000000000000..35e6997abd17588e17a82d448918fc1b3bd7220e
--- /dev/null
+++ b/doc/fluid/images/graph_construction_example.bash
@@ -0,0 +1,11 @@
+cat ./graph_construction_example.dot | \
+ sed 's/color=red/color=red, style=invis/g' | \
+ sed 's/color=green/color=green, style=invis/g' | \
+ dot -Tpng > graph_construction_example_forward_only.png
+
+cat ./graph_construction_example.dot | \
+ sed 's/color=green/color=green, style=invis/g' | \
+ dot -Tpng > graph_construction_example_forward_backward.png
+
+cat ./graph_construction_example.dot | \
+ dot -Tpng > graph_construction_example_all.png
diff --git a/doc/fluid/images/graph_construction_example.dot b/doc/fluid/images/graph_construction_example.dot
new file mode 100644
index 0000000000000000000000000000000000000000..e115f9844bae6ad24f638c8ed4749cea8aff06a9
--- /dev/null
+++ b/doc/fluid/images/graph_construction_example.dot
@@ -0,0 +1,68 @@
+digraph ImageClassificationGraph {
+ ///////// The forward part /////////
+ FeedX [label="Feed", color=blue, shape=box];
+ FeedY [label="Feed", color=blue, shape=box];
+ InitW [label="Init", color=blue, shape=diamond];
+ Initb [label="Init", color=blue, shape=diamond];
+ FC [label="FC", color=blue, shape=box];
+ MSE [label="MSE", color=blue, shape=box];
+
+ x [label="x", color=blue, shape=oval];
+ l [label="l", color=blue, shape=oval];
+ y [label="y", color=blue, shape=oval];
+ W [label="W", color=blue, shape=doublecircle];
+ b [label="b", color=blue, shape=doublecircle];
+ cost [label="cost", color=blue, shape=oval];
+
+ FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
+ FeedY -> l [color=blue];
+ InitW -> W [color=blue];
+ Initb -> b [color=blue];
+ W -> FC [color=blue];
+ b -> FC [color=blue];
+ l -> MSE [color=blue];
+
+ ////////// The backward part /////////
+ MSE_Grad [label="MSE_grad", color=red, shape=box];
+ FC_Grad [label="FC_grad", color=red, shape=box];
+
+ d_cost [label="d cost", color=red, shape=oval];
+ d_y [label="d y", color=red, shape=oval];
+ d_b [label="d b", color=red, shape=oval];
+ d_W [label="d W", color=red, shape=oval];
+
+ cost -> MSE_Grad [color=red];
+ d_cost -> MSE_Grad [color=red];
+ l -> MSE_Grad [color=red];
+ y -> MSE_Grad -> d_y [color=red];
+
+ x -> FC_Grad [color=red];
+ y -> FC_Grad [color=red];
+ d_y -> FC_Grad [color=red];
+ W -> FC_Grad -> d_W [color=red];
+ b -> FC_Grad -> d_b [color=red];
+
+ ////////// The optimization part //////////
+
+ OPT_W [label="SGD", color=green, shape=box];
+ OPT_b [label="SGD", color=green, shape=box];
+
+ W -> OPT_W [color=green];
+ b -> OPT_b [color=green];
+ d_W -> OPT_W -> W [color=green];
+ d_b -> OPT_b -> b [color=green];
+
+ ////////// Groupings //////////
+
+ subgraph clusterMSE {
+ style=invis;
+ MSE;
+ MSE_Grad;
+ }
+
+ subgraph clusterFC {
+ style=invis;
+ FC;
+ FC_Grad;
+ }
+}
diff --git a/doc/fluid/images/graph_construction_example_all.png b/doc/fluid/images/graph_construction_example_all.png
new file mode 100644
index 0000000000000000000000000000000000000000..261611a5721f9aa97874f7e6d897fe48cf667db2
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_all.png differ
diff --git a/doc/fluid/images/graph_construction_example_forward_backward.png b/doc/fluid/images/graph_construction_example_forward_backward.png
new file mode 100644
index 0000000000000000000000000000000000000000..4c69687f4a6a181138f3df72ce5e8aa48487b5be
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_forward_backward.png differ
diff --git a/doc/fluid/images/graph_construction_example_forward_only.png b/doc/fluid/images/graph_construction_example_forward_only.png
new file mode 100644
index 0000000000000000000000000000000000000000..e668c16e0cac73acb4e5dc2b1827557ae77126b4
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_forward_only.png differ
diff --git a/doc/fluid/images/l1_regularization.png b/doc/fluid/images/l1_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972
Binary files /dev/null and b/doc/fluid/images/l1_regularization.png differ
diff --git a/doc/fluid/images/l2_regularization.png b/doc/fluid/images/l2_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298
Binary files /dev/null and b/doc/fluid/images/l2_regularization.png differ
diff --git a/doc/fluid/images/local-graph.graffle b/doc/fluid/images/local-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..19e509bd9af3c1e9a3f5e0f16ddd281457a339c5
Binary files /dev/null and b/doc/fluid/images/local-graph.graffle differ
diff --git a/doc/fluid/images/local-graph.png b/doc/fluid/images/local-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..ada51200f793a9bb18911e7d63cfdb3244b967d7
Binary files /dev/null and b/doc/fluid/images/local-graph.png differ
diff --git a/doc/fluid/images/local_architecture.graffle b/doc/fluid/images/local_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..49fcc663ebe3824aa234e3a67aadf285cb417877
Binary files /dev/null and b/doc/fluid/images/local_architecture.graffle differ
diff --git a/doc/fluid/images/local_architecture.png b/doc/fluid/images/local_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122
Binary files /dev/null and b/doc/fluid/images/local_architecture.png differ
diff --git a/doc/fluid/images/lookup_table.png b/doc/fluid/images/lookup_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..72dfe3547f731d0d090338afb206b0549dff472e
Binary files /dev/null and b/doc/fluid/images/lookup_table.png differ
diff --git a/doc/fluid/images/lookup_table_training.png b/doc/fluid/images/lookup_table_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e
Binary files /dev/null and b/doc/fluid/images/lookup_table_training.png differ
diff --git a/doc/fluid/images/loss_equation.png b/doc/fluid/images/loss_equation.png
new file mode 100644
index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e
Binary files /dev/null and b/doc/fluid/images/loss_equation.png differ
diff --git a/doc/fluid/images/multi-threads.graffle b/doc/fluid/images/multi-threads.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6
Binary files /dev/null and b/doc/fluid/images/multi-threads.graffle differ
diff --git a/doc/fluid/images/multi-threads@3x.png b/doc/fluid/images/multi-threads@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9
Binary files /dev/null and b/doc/fluid/images/multi-threads@3x.png differ
diff --git a/doc/fluid/images/multigpu_allreduce.graffle b/doc/fluid/images/multigpu_allreduce.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..cb5bc420ceafe8ba4c87694d44ee4e5e4ad06779
Binary files /dev/null and b/doc/fluid/images/multigpu_allreduce.graffle differ
diff --git a/doc/fluid/images/multigpu_allreduce.png b/doc/fluid/images/multigpu_allreduce.png
new file mode 100644
index 0000000000000000000000000000000000000000..87a1b3e8f6dd4a713ec9df9f0037d1da04e9178a
Binary files /dev/null and b/doc/fluid/images/multigpu_allreduce.png differ
diff --git a/doc/fluid/images/multigpu_before_convert.graffle b/doc/fluid/images/multigpu_before_convert.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..6c35ab1b21fb76ceae82d3693ed0d085b5bc0855
Binary files /dev/null and b/doc/fluid/images/multigpu_before_convert.graffle differ
diff --git a/doc/fluid/images/multigpu_before_convert.png b/doc/fluid/images/multigpu_before_convert.png
new file mode 100644
index 0000000000000000000000000000000000000000..9c8f7711165d80a2fa3911280fdee91855a401b1
Binary files /dev/null and b/doc/fluid/images/multigpu_before_convert.png differ
diff --git a/doc/fluid/images/multiple_reader.png b/doc/fluid/images/multiple_reader.png
new file mode 100644
index 0000000000000000000000000000000000000000..b22126b31db4982c13fc3a0827805e6aaf955046
Binary files /dev/null and b/doc/fluid/images/multiple_reader.png differ
diff --git a/doc/fluid/images/paddle-compile.graffle b/doc/fluid/images/paddle-compile.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..a6348cc3dbcaca923c6e794681b2edb85cb9f8f6
Binary files /dev/null and b/doc/fluid/images/paddle-compile.graffle differ
diff --git a/doc/fluid/images/paddle-compile.png b/doc/fluid/images/paddle-compile.png
new file mode 100644
index 0000000000000000000000000000000000000000..e0f13d551ac41afaec627a57dea79356464bf0bf
Binary files /dev/null and b/doc/fluid/images/paddle-compile.png differ
diff --git a/doc/fluid/images/pprof_1.png b/doc/fluid/images/pprof_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb
Binary files /dev/null and b/doc/fluid/images/pprof_1.png differ
diff --git a/doc/fluid/images/pprof_2.png b/doc/fluid/images/pprof_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b
Binary files /dev/null and b/doc/fluid/images/pprof_2.png differ
diff --git a/doc/fluid/images/profiler.png b/doc/fluid/images/profiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1
Binary files /dev/null and b/doc/fluid/images/profiler.png differ
diff --git a/doc/fluid/images/readers.png b/doc/fluid/images/readers.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd59168ce16c9e2a0ef45303c28c997cfd7740be
Binary files /dev/null and b/doc/fluid/images/readers.png differ
diff --git a/doc/fluid/images/remote_executor.graffle b/doc/fluid/images/remote_executor.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..41b2067311694b56d211a4f32d1b76884eeffd2d
Binary files /dev/null and b/doc/fluid/images/remote_executor.graffle differ
diff --git a/doc/fluid/images/remote_executor.png b/doc/fluid/images/remote_executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..744e2fb2e0f1bbe058e991ba7b2a09000965ee79
Binary files /dev/null and b/doc/fluid/images/remote_executor.png differ
diff --git a/doc/fluid/images/rnn.dot b/doc/fluid/images/rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5
--- /dev/null
+++ b/doc/fluid/images/rnn.dot
@@ -0,0 +1,87 @@
+digraph G {
+ label = "simple RNN implementation"
+
+ ranksep=2;
+
+ //graph [nodesep=1, ranksep=1];
+
+ node[nodesep=1]
+
+ subgraph cluster0 {
+ label = "global scope"
+ rankdir = TB
+ W
+ boot_memory
+ input
+ output
+ }
+
+ subgraph cluster1 {
+ label = "step-scope 0"
+ rankdir = TB
+ memory0[label="memory"]
+ prememory0[label="pre-memory"]
+ step_input0[label="step input"]
+ step_output0[label="step output"]
+ }
+
+ subgraph cluster2 {
+ label = "step-scope 1"
+ rankdir = TB
+ memory1[label="memory"]
+ prememory1[label="pre-memory"]
+ step_input1[label="step input"]
+ step_output1[label="step output"]
+ }
+
+ subgraph cluster3 {
+ label = "step-scope 2"
+ rankdir = TB
+ memory2[label="memory"]
+ prememory2[label="pre-memory"]
+ step_input2[label="step input"]
+ step_output2[label="step output"]
+ }
+
+ stepnet [shape=box]
+ stepnet0 [shape=box, style=dashed]
+ stepnet1 [shape=box, style=dashed]
+ stepnet2 [shape=box, style=dashed]
+
+
+ edge[color=blue]
+ boot_memory -> prememory0 [label="init" color="blue"]
+ memory0 -> prememory1 [label="copy/reference" color="blue"]
+ memory1 -> prememory2 [label="copy/reference" color="blue"]
+
+ edge[color=black]
+ W -> stepnet0[constraint=false, style=dashed]
+ W -> stepnet1[constraint=false, style=dashed]
+ W -> stepnet2[constraint=false, style=dashed]
+
+ memory0 -> stepnet0[style=dashed]
+ prememory0 -> stepnet0 -> step_output0[style=dashed]
+
+ memory1 -> stepnet1[style=dashed]
+ prememory1 -> stepnet1 -> step_output1[style=dashed]
+
+ memory2 -> stepnet2[style=dashed]
+ prememory2 -> stepnet2 -> step_output2[style=dashed]
+
+ input -> step_input0
+ input -> step_input1
+ input -> step_input2
+
+ step_input0 -> stepnet0 [style=dashed]
+ step_input1 -> stepnet1[style=dashed]
+ step_input2 -> stepnet2[style=dashed]
+
+ step_output0 -> output
+ step_output1 -> output
+ step_output2 -> output
+
+ stepnet0 -> stepnet[style=dashed]
+ stepnet1 -> stepnet[style=dashed]
+ stepnet2 -> stepnet[style=dashed]
+
+}
diff --git a/doc/fluid/images/rnn.jpg b/doc/fluid/images/rnn.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9867e404cf959df0dce6ded5222b466c788fb840
Binary files /dev/null and b/doc/fluid/images/rnn.jpg differ
diff --git a/doc/fluid/images/rnn.png b/doc/fluid/images/rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..e139e373fe8396782044cfd936fdde624f8c66fe
Binary files /dev/null and b/doc/fluid/images/rnn.png differ
diff --git a/doc/fluid/images/rnn_2level_data.dot b/doc/fluid/images/rnn_2level_data.dot
new file mode 100644
index 0000000000000000000000000000000000000000..1d85ae2617a915ad0ad8288d848b607cc37ad297
--- /dev/null
+++ b/doc/fluid/images/rnn_2level_data.dot
@@ -0,0 +1,75 @@
+digraph G {
+ chapter [label="chapter"]
+
+ subgraph cluster0 {
+ label = "paragraph 0"
+
+ top_rnn0[label="top rnn step 0" shape=box]
+
+ p0 [label="paragraph 0"]
+ p1 [label="paragraph 1"]
+ }
+
+ subgraph cluster1{
+ label = "paragraph 1"
+
+ top_rnn1[label="top rnn step 1" shape=box]
+
+ p2 [label="paragraph 0"]
+ p3 [label="paragraph 1"]
+ }
+
+ subgraph cluster_p0 {
+ label = "sentence 0"
+
+ low_rnn0 [label="low rnn step 0" shape=box]
+ s00 [label="sentence 0"]
+ s01 [label="sentence 1"]
+
+ low_rnn0 -> s00
+ low_rnn0 -> s01
+ }
+
+ subgraph cluster_p1 {
+ label = "sentence 1"
+ low_rnn1 [label="low rnn step 1" shape=box]
+ s10 [label="sentence 0"]
+ s11 [label="sentence 1"]
+ low_rnn1 -> s10
+ low_rnn1 -> s11
+ }
+
+ subgraph cluster_p2 {
+ label = "sentence 1"
+ low_rnn2 [label="low rnn step 0" shape=box]
+ s20 [label="sentence 0"]
+ s21 [label="sentence 1"]
+ low_rnn2 -> s20
+ low_rnn2 -> s21
+ }
+
+ subgraph cluster_p3 {
+ label = "sentence 1"
+ low_rnn3 [label="low rnn step 1" shape=box]
+ s30 [label="sentence 0"]
+ s31 [label="sentence 1"]
+ low_rnn3 -> s30
+ low_rnn3 -> s31
+ }
+
+
+ chapter -> top_rnn0
+ chapter -> top_rnn1
+
+ top_rnn0 -> p0
+ top_rnn0 -> p1
+ top_rnn1 -> p2
+ top_rnn1 -> p3
+
+
+ p0 -> low_rnn0
+ p1 -> low_rnn1
+ p2 -> low_rnn2
+ p3 -> low_rnn3
+
+}
diff --git a/doc/fluid/images/rnn_2level_data.png b/doc/fluid/images/rnn_2level_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..4be81b2430717a6a506342a09fc26899568574c6
Binary files /dev/null and b/doc/fluid/images/rnn_2level_data.png differ
diff --git a/doc/fluid/images/single-thread@3x.png b/doc/fluid/images/single-thread@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a
Binary files /dev/null and b/doc/fluid/images/single-thread@3x.png differ
diff --git a/doc/fluid/images/sparse_update.graffle b/doc/fluid/images/sparse_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..08d689a58f83698d8c1158ee3990ed8abf3a7a9a
Binary files /dev/null and b/doc/fluid/images/sparse_update.graffle differ
diff --git a/doc/fluid/images/sparse_update.png b/doc/fluid/images/sparse_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c872e6ac479f7d1b818a4a207956c43155d0ad7
Binary files /dev/null and b/doc/fluid/images/sparse_update.png differ
diff --git a/doc/fluid/images/test.dot b/doc/fluid/images/test.dot
new file mode 100644
index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a
--- /dev/null
+++ b/doc/fluid/images/test.dot
@@ -0,0 +1,35 @@
+
+digraph Test {
+ z -> generator -> G_img;
+ G_img -> discriminator -> D_f -> d_loss_f;
+ label0 -> d_loss_f -> d_loss;
+
+ img -> discriminator -> D_t -> d_loss_t;
+ label1 -> d_loss_t -> d_loss;
+
+ d_loss -> d_loss_t[color=red, style=dashed];
+ d_loss -> d_loss_f[color=red, style=dashed];
+ d_loss_t -> D_t[color=red, style=dashed];
+ d_loss_f -> D_f[color=red, style=dashed];
+ D_t -> discriminator[color=red, style=dashed];
+ D_f -> discriminator[color=red, style=dashed];
+
+ D_f -> g_loss;
+ label2 -> g_loss;
+
+ g_loss -> D_f[color=green, style=dashed];
+ D_f -> discriminator[color=green, style=dashed];
+ discriminator -> G_img[color=green, style=dashed];
+ G_img -> generator[color=green, style=dashed];
+
+ discriminator [color=red, shape=box];
+ generator [color=green, shape=box];
+ z [shape=diamond];
+ img [shape=diamond];
+ label0 [shape=diamond];
+ label1 [shape=diamond];
+ label2 [shape=diamond];
+
+ d_loss [color=red];
+ g_loss [color=green];
+}
diff --git a/doc/fluid/images/test.dot.png b/doc/fluid/images/test.dot.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55
Binary files /dev/null and b/doc/fluid/images/test.dot.png differ
diff --git a/doc/fluid/images/theta_star.gif b/doc/fluid/images/theta_star.gif
new file mode 100644
index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2
Binary files /dev/null and b/doc/fluid/images/theta_star.gif differ
diff --git a/doc/fluid/images/timeline.jpeg b/doc/fluid/images/timeline.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385
Binary files /dev/null and b/doc/fluid/images/timeline.jpeg differ
diff --git a/doc/fluid/images/tracing.jpeg b/doc/fluid/images/tracing.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5
Binary files /dev/null and b/doc/fluid/images/tracing.jpeg differ
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 260b6c9fd1b364433cae098bacea77aa7fe9e266..76b82fd97f1ed642696c4414676b694ebda9ad81 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -13,7 +13,7 @@
# serve to show the default.
import sys
import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
import shlex
from recommonmark import parser, transform
import paddle
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index e5757b86b43001bc6090d8edd0aaa5ff4fc476ee..5aa5c1381fa3fad4ebc181c7868da03ae0138016 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -13,7 +13,7 @@
# serve to show the default.
import sys
import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
import shlex
from recommonmark import parser, transform
import paddle
diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt
index 82de7a3a3e1ca7724e1eda877d53454a4fa4129a..be957d37b14c618e9346251b3bd3dbaf1541773f 100644
--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_v2_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_v2_docs gen_proto_py)
+add_dependencies(paddle_v2_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_v2_docs_cn
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_v2_docs_cn gen_proto_py)
+add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)
add_subdirectory(api)
diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt
index da1eafc02ed8cd155d4f0f1fbadcb7b237b6fcc1..2670a21a227546ffcee4f10f395feef3c58df9b4 100644
--- a/doc/v2/api/CMakeLists.txt
+++ b/doc/v2/api/CMakeLists.txt
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_v2_apis
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/doc/v2/faq/build_and_install/index_en.rst b/doc/v2/faq/build_and_install/index_en.rst
index 614db457d715665073cec1a495d4d7df6887532f..7488ed8137d57785f36b9f1e1ed1269f864960bc 100644
--- a/doc/v2/faq/build_and_install/index_en.rst
+++ b/doc/v2/faq/build_and_install/index_en.rst
@@ -1,5 +1,143 @@
-############################
-Install, Build and Unit test
-############################
+.. _install_faq:
-TBD
+###############################
+Compile, Install, and Unit Test
+###############################
+
+.. contents::
+
+1. Insufficient CUDA driver version
+----------------------------------------------------------------
+
+Users often face errors like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is usually that the local CUDA driver libraries are not mapped into the container.
+You can solve the issue by running the following commands:
+
+.. code-block:: bash
+
+ $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+ $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+ $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+For more information about Docker installation and usage, please refer to the `PaddlePaddle Docker documentation `_ .
+
+
+2. Version mismatch between PythonLibs and PythonInterpreter
+----------------------------------------------------------------
+
+This is a common issue when CMake looks up Python. If you have installed multiple versions of Python, CMake may find a PythonLibs version that does not match the PythonInterpreter. In that case, you have to specify the Python version explicitly, as follows.
+
+ .. code-block:: bash
+
+ cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path> -DPYTHON_INCLUDE_DIR=<inc_path>
+
+You should replace ``<exc_path>``, ``<lib_path>``, and ``<inc_path>`` with your local paths.
+
+3. PaddlePaddle version is 0.0.0
+------------------------------------------------
+This issue can happen when you run :code:`paddle version` or :code:`cmake ..`:
+
+.. code-block:: bash
+
+ CMake Warning at cmake/version.cmake:20 (message):
+ Cannot add paddle version from git tag
+
+You should pull all remote branches to your local machine with the command :code:`git fetch upstream`, and then run :code:`cmake` again.
+
+4. paddlepaddle\*.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
+
+The primary cause is that pip cannot find a PaddlePaddle installation package that matches your current system. The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12, with Python 2.7 and pip 9.0.1.
+
+You can upgrade pip with the following command:
+
+.. code-block:: bash
+
+ pip install --upgrade pip
+
+If that does not work, run :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the package suffixes that your system supports, and compare them with the suffix of your installation package.
+
+If the system supports :code:`linux_x86_64` and the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest version;
+
+if the system supports :code:`manylinux1_x86_64` and the local installation package is :code:`linux_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again.
+
+
+5. ImportError: No module named v2
+----------------------------------
+Please uninstall Paddle V1 if you have installed it before.
+
+.. code-block:: bash
+
+ pip uninstall py_paddle paddle
+
+Then install the Python package for PaddlePaddle: enter the build directory and run the following commands.
+
+.. code-block:: bash
+
+ pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
+
+6. Illegal instruction
+-----------------------
+This issue is usually caused by running a PaddlePaddle binary built with AVX SIMD instructions (used to speed up CPU computation) on a CPU that does not support AVX. Please choose the version that matches your CPU.
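+
+To quickly check whether your CPU supports AVX on Linux, run the following command; a non-zero count means AVX is available:
+
+.. code-block:: bash
+
+ grep -c avx /proc/cpuinfo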
+
+7. Python unittest fails
+--------------------------------
+
+If the following Python unit test cases fail:
+
+.. code-block:: bash
+
+ 24 - test_PyDataProvider (Failed)
+ 26 - test_RecurrentGradientMachine (Failed)
+ 27 - test_NetworkCompare (Failed)
+ 28 - test_PyDataProvider2 (Failed)
+ 32 - test_Prediction (Failed)
+ 33 - test_Compare (Failed)
+ 34 - test_Trainer (Failed)
+ 35 - test_TrainerOnePass (Failed)
+ 36 - test_CompareTwoNets (Failed)
+ 37 - test_CompareTwoOpts (Failed)
+ 38 - test_CompareSparse (Failed)
+ 39 - test_recurrent_machine_generation (Failed)
+ 40 - test_PyDataProviderWrapper (Failed)
+ 41 - test_config_parser (Failed)
+ 42 - test_swig_api (Failed)
+ 43 - layers_test (Failed)
+
+Please check the PaddlePaddle unittest logs which may suggest the following:
+
+.. code-block:: bash
+
+ paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+ Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+The solution is:
+
+* Remove the old PaddlePaddle installation to make a clean environment for the unit tests. If the PaddlePaddle package is already in Python's site-packages, the unit tests would refer to the Python package in site-packages instead of the one in the :code:`/python` directory of the source tree. Setting :code:`PYTHONPATH` to :code:`/python` is also useless, because Python's search path gives priority to the installed Python package.
+
+
+8. Failed to download the MKLML library
+----------------------------------------------
+
+.. code-block:: bash
+
+ make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4
+ make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2
+ make[1]: *** waiting for the unfinished jobs....
+
+Cause: a slow network or SSL connection issues can cause the MKLML library download to fail.
+
+The solution is to download and install the library manually. The specific steps are as follows.
+
+.. code-block:: bash
+
+ # 1. enter the directory
+ cd build/third_party/mklml/src/extern_mklml
+
+ # 2. check the size of the package; it is normally about 75M, and a smaller size means the download failed
+ du -sh mklml_lnx_2018.0.1.20171007.tgz
+
+ # 3. download manually, unzip, and create the download-success stamp:
+ wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz
+ tar zxf mklml_lnx_2018.0.1.20171007.tgz
+ touch ../extern_mklml-stamp/extern_mklml-download
+
+ # 4. then run make again
+
diff --git a/doc/v2/howto/cmd_parameter/index_en.rst b/doc/v2/howto/cmd_parameter/index_en.rst
index 0e3c72d27aca063f1b6f1c23e55718dba373c40a..f49683948ef78f363e2439cc25332431830eeb24 100644
--- a/doc/v2/howto/cmd_parameter/index_en.rst
+++ b/doc/v2/howto/cmd_parameter/index_en.rst
@@ -2,10 +2,25 @@
Set Command-line Parameters
===========================
+Deep learning algorithms vary in many aspects of their implementation, such as the running environment, the running stage, the model structure, and the training strategy. PaddlePaddle allows users to set various command-line parameters flexibly, which helps to control the model training or prediction process.
+
+In this part, we take several real scenarios as examples to show how some command-line parameters are used:
.. toctree::
:maxdepth: 1
use_case_en.md
+
+Then, we summarize and classify the use of all command-line parameters:
+
+.. toctree::
+ :maxdepth: 1
+
arguments_en.md
+
+Finally, we give detailed descriptions and explain the properties and significance of these command-line parameters:
+
+.. toctree::
+ :maxdepth: 1
+
detail_introduction_en.md
diff --git a/doc/v2/howto/rnn/recurrent_group_en.md b/doc/v2/howto/rnn/recurrent_group_en.md
index d264b0a9f85faffd49c1982117cb5a3ac6ffc015..de6b60f29eb97029a54609cd2194bb7faf3ffec5 100644
--- a/doc/v2/howto/rnn/recurrent_group_en.md
+++ b/doc/v2/howto/rnn/recurrent_group_en.md
@@ -1,3 +1,96 @@
# Recurrent Group Tutorial
-TBD
+## Overview
+
+Sequential data is common in natural language processing.
+
+A sentence is a sequence of words, and many sentences in turn form a paragraph. Therefore, a paragraph can be viewed as a nested sequence with two levels, where each element of the sequence is another sequence. That is to say, sequential data can be recursive. For example, an article is composed of a sequence of sentences, and each sentence is a sequence of words.
+
+PaddlePaddle and PaddlePaddle v2 support two-level recursive sequential data. A two-level sequence is a very flexible data structure, which helps us better describe complex language data such as paragraphs and multi-round dialogues. Based on two-level sequence input, we can design and build a flexible, hierarchical RNN model that encodes input data at both the word and sentence levels. For support of an arbitrary number of levels, please refer to PaddlePaddle Fluid.
+
+In PaddlePaddle, `recurrent_group` is an arbitrarily complex RNN unit. The user only needs to define the calculation that the RNN completes in one time step; PaddlePaddle is responsible for propagating information and errors through the time series.
+
+Furthermore, `recurrent_group` can be extended to handle two-level sequences. By defining two nested `recurrent_group` operations, one at the clause level and one at the word level, a hierarchical, complex RNN can be built.
+
+Currently, in PaddlePaddle, `recurrent_group` and some layers can process double-layer sequences. For details, refer to the document: Layers for supporting double-layer sequences as input.
+
+## Related Concepts
+
+### Basic Principle
+`recurrent_group` is an arbitrarily complex RNN unit supported by PaddlePaddle. The user only needs to focus on the calculations that the RNN is designed to complete within a single time step; PaddlePaddle is responsible for propagating information and gradients over time.
+
+In PaddlePaddle, a simple call to `recurrent_group` is as follows:
+
+``` python
+recurrent_group(step, input, reverse)
+```
+- step: A callable function that defines the calculations completed by the RNN unit within a time step
+- input: The input must be a single-layer sequence or a double-layer sequence
+- reverse: Whether to process the input sequence in reverse order
+
+The core of using `recurrent_group` is to design the logic of the step function. The step function can freely combine various layers supported by PaddlePaddle to implement arbitrary computation logic. The input of `recurrent_group` becomes the input of the step function, and since the step function only focuses on the computation within one time step of the RNN, `recurrent_group` completes the splitting of the original input data for us.
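+
+As a minimal sketch (assuming PaddlePaddle v2's Python API; `word_embedding` stands for a hypothetical upstream embedding layer), a step function that keeps its recurrent state through a memory could look like this:
+
+``` python
+import paddle.v2 as paddle
+
+def step(word):
+    # the memory reads the output of the layer named "rnn_state"
+    # at the previous time step (initialized to 0 at step 0)
+    state = paddle.layer.memory(name="rnn_state", size=128)
+    # the fc layer is named "rnn_state", so the memory above refers to it
+    return paddle.layer.fc(input=[word, state],
+                           size=128,
+                           act=paddle.activation.Tanh(),
+                           name="rnn_state")
+
+# recurrent_group splits the input sequence and applies `step` at each time step
+rnn_out = paddle.layer.recurrent_group(step=step, input=word_embedding)
+```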
+
+### Input
+The input sequence processed by `recurrent_group` is mainly divided into the following three types:
+
+- **Input Data**: When putting a two-level sequence into `recurrent_group`, it will be disassembled into a single-level sequence. When putting a single-level sequence into `recurrent_group`, it will be disassembled into a non-sequence and then passed to the step function. This process is completely transparent to the user. There are two possible types: 1) User input via data_layer; 2) Output from other layers.
+
+- **Read-only Memory Input**: `StaticInput` defines a read-only Memory. The input specified by `StaticInput` will not be disassembled by `recurrent_group`, and each time step of the `recurrent_group` loop will always be able to reference all inputs. It may be a non-sequence or a single-layer sequence.
+
+- **Input of Sequence Generation Task**: `GeneratedInput` is only used to specify input data in a sequence generation task.
+
+### Input Example
+
+Sequence generation tasks mostly follow the encoder-decoder architecture. The encoder and decoder can be arbitrary neural network units capable of processing sequences, and RNN is the most popular choice.
+
+Given the encoder output and the current word, the decoder predicts the next most likely word each time. In this structure, the decoder accepts two inputs:
+
+- Target sequence to be generated: an input of the decoder and the basis of the decoder loop. `recurrent_group` will disassemble this type of input.
+
+- Encoder output, a non-sequence or single-level sequence: an unbounded memory. Each time step in the decoder loop references the entire result, so it should not be disassembled. This type of input must be specified via `StaticInput`. For more discussion on unbounded memory, please refer to the paper [Neural Turing Machines](https://arxiv.org/abs/1410.5401).
+
+In a sequence generation task, the decoder RNN always takes the word vector of the word predicted at the previous time step as its current input. `GeneratedInput` automates this process.
+
+### Output
+The `step` function must return the output of one or more Layers. The output of this Layer will be the final output of the entire `recurrent_group`. In the output process, `recurrent_group` will concatenate the output of each time step, which is also transparent to the user.
+
+### Memory
+Memory can only be defined and used in `recurrent_group`. Memory cannot exist independently and must point to a layer defined by PaddlePaddle. By referencing memory, that layer's output at a previous moment is obtained, so memory can be interpreted as a delay operation.
+
+The user can explicitly specify the output of a layer to initialize the memory. When not specified, memory is initialized to 0 by default.
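+
+For example, in a sketch assuming the v2 Python API, where `init_state` is a hypothetical layer whose output supplies the initial state:
+
+``` python
+# initialize the memory from the output of `init_state` instead of zeros
+state = paddle.layer.memory(name="rnn_state", size=128, boot_layer=init_state)
+```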
+
+## Sequence-level RNN Introduction
+
+`recurrent_group` helps us to split the input sequence, merge the output, and loop through the sequence of computational logic.
+
+Using this feature, two nested `recurrent_group` calls can handle nested two-level sequences, implementing sequence-level RNN structures at both the word and sentence levels.
+
+- Word-level RNN: each state corresponds to a word.
+- Sequence-level RNN: a sequence-level RNN consists of multiple word-level RNNs. Each word-level RNN (i.e., each state of the sequence-level RNN) corresponds to a subsequence.
+
+For convenience of description, the following takes an NLP task as an example. A paragraph containing subsequences (sentences) is defined as a two-level sequence, and a sentence containing words is defined as a single-level sequence; a word is then a zero-level sequence.
+
+## Usage of Sequence-level RNN
+
+### Usage of Training Process
+Using `recurrent_group` requires the following conventions:
+
+- **Single-input Single-output**: both input and output are single-level sequences.
+ - If there are multiple inputs, the numbers of words in the different input sequences must be exactly equal.
+ - The output is a single-level sequence, and the number of words in the output sequence is the same as in the input sequences.
+ - memory: define a memory in the step function pointing to a layer; by referencing the memory, that layer's output at a previous moment is obtained, forming a recurrent connection. The is_seq parameter of memory must be false. If memory is not defined, the operations within each time step are independent.
+ - boot_layer: the initial state of memory, 0 by default. is_seq in memory must be false.
+
+- **Double-input Double-output**: both input and output are two-level sequences (see the sketch after this list).
+ - If there are multiple input sequences, the numbers of subsequences contained in the different inputs must be strictly equal, but the numbers of words in the subsequences may differ.
+ - The output is a two-level sequence; its number of subsequences and its numbers of words are the same as those of a specified input sequence (the first input by default).
+ - memory: define a memory in the step function pointing to a layer; by referencing the memory, that layer's output at a previous moment is obtained, forming a recurrent connection. The memory defined in the outer `recurrent_group` step function can record the state of the previous subsequence, either as a single-level sequence (only as read-only memory) or as a word. If memory is not defined, the operations between subsequences are independent.
+ - boot_layer: the initial state of memory, either a single-level sequence (only as read-only memory) or a vector. It is not set by default, i.e. the initial state is 0.
+
+- **Double-input Single-output**: not supported for now; it fails with the error "In hierachical RNN, all out links should be from sequences now".
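+
+Below is a minimal sketch of the double-input double-output case (assuming the v2 Python API; `paragraph` stands for a hypothetical two-level sequence input): the outer step receives one subsequence, and an inner `recurrent_group` iterates over its words.
+
+``` python
+def inner_step(word):
+    # word-level recurrent state within one sentence
+    inner_state = paddle.layer.memory(name="inner_state", size=64)
+    return paddle.layer.fc(input=[word, inner_state],
+                           size=64,
+                           act=paddle.activation.Tanh(),
+                           name="inner_state")
+
+def outer_step(sentence):
+    # each outer time step runs a word-level RNN over one sentence
+    return paddle.layer.recurrent_group(step=inner_step, input=sentence)
+
+out = paddle.layer.recurrent_group(step=outer_step, input=paragraph)
+```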
+
+### Usage of Generation Process
+Using `beam_search` requires following these conventions (a generation sketch follows this list):
+
+- Word-level RNN: generate the next word from a word.
+- Sequence-level RNN: the subsequences generated by the single-level RNN are concatenated into a new two-level sequence. Semantically, there is no case where a subsequence directly generates the next subsequence.
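+
+A sketch of the generation setup, assuming the v2 API as used in the machine-translation example (`decoder_step`, `encoded_vector`, and `target_dict_dim` are hypothetical placeholders):
+
+``` python
+gen_inputs = [
+    # read-only memory: the decoder references the whole encoder output
+    paddle.layer.StaticInput(input=encoded_vector),
+    # feeds the embedding of the previously predicted word back as input
+    paddle.layer.GeneratedInput(size=target_dict_dim,
+                                embedding_name="_target_language_embedding",
+                                embedding_size=512),
+]
+beam_gen = paddle.layer.beam_search(step=decoder_step,
+                                    input=gen_inputs,
+                                    bos_id=0,   # begin-of-sequence token id
+                                    eos_id=1,   # end-of-sequence token id
+                                    beam_size=3,
+                                    max_length=100)
+```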
diff --git a/paddle/.gitignore b/paddle/.gitignore
index f921eef14156a97e4fd250f014960e306b43f35a..1c1c0c2c829f088d7e3f52ca007fcb8f33a16a36 100644
--- a/paddle/.gitignore
+++ b/paddle/.gitignore
@@ -1,3 +1,4 @@
+.timestamp
*.o
*.a
.svn
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index cf84568ecdf1227b0d0ed3606a4a9a6e5186af72..06e1f5d5f0884efabfcdf917ca5c35d94ad5dce9 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -89,16 +89,17 @@ SWIG_LINK_LIBRARIES(swig_paddle
${START_END}
)
-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
- COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
- COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
- COMMAND ${CMAKE_COMMAND} -E touch .timestamp
+add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/py_paddle
+ COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_BINARY_DIR}/python/py_paddle
+ COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_BINARY_DIR}/python/py_paddle
+ COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/.timestamp
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
DEPENDS _swig_paddle
)
# TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
+add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so)
if(WITH_TESTING)
IF(NOT PY_PIP_FOUND)
diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt
index 761aeb5b174105edece8880a9f5012c13a63fd11..13cb79129cc2272d215cdb475fb146b37266699e 100644
--- a/paddle/api/test/CMakeLists.txt
+++ b/paddle/api/test/CMakeLists.txt
@@ -1,3 +1,8 @@
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py
+ COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/*.py ${CMAKE_CURRENT_BINARY_DIR}
+)
+add_custom_target(copy_api_test ALL DEPENDS testTrain.py)
+
py_test(testTrain SRCS testTrain.py)
py_test(testMatrix SRCS testMatrix.py)
py_test(testVector SRCS testVector.py)
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 63ec51564793ca2255032d0efbe2c47326f8b698..b790fa39fe863bbb00f6cd36d4c63481b7634fe1 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -370,4 +370,48 @@ extern void hl_maxout_backward(real* inGrad,
size_t featLen,
size_t groups);
+/**
+ * @brief Upsample forward.
+ * @param[in] inputData input data.
+ * @param[in] maskData the mask data from MaxPoolWithMaskLayer.
+ * @param[in] batchSize the batch size of the input.
+ * @param[in] imgSizeH image height.
+ * @param[in] imgSizeW image width.
+ * @param[in] channels the input channels.
+ * @param[in] outputH the output height.
+ * @param[in] outputW the output width.
+ * @param[out] outputData output data.
+ */
+extern void hl_upsample_forward(real* inputData,
+ real* maskData,
+ size_t batchSize,
+ size_t imgSizeH,
+ size_t imgSizeW,
+ size_t channels,
+ size_t outputH,
+ size_t outputW,
+ real* outputData);
+
+/**
+ * @brief Upsample backward.
+ * @param[in] outputGradData the output grad data.
+ * @param[in] maskData the mask data from MaxPoolWithMaskLayer.
+ * @param[in] batchSize the batch size of the input.
+ * @param[in] imgSizeH image height.
+ * @param[in] imgSizeW image width.
+ * @param[in] channels the input channels.
+ * @param[in] outputH the output height.
+ * @param[in] outputW the output width.
+ * @param[out] inputGradData the input grad data.
+ */
+extern void hl_upsample_backward(real* outputGradData,
+ real* maskData,
+ size_t batchSize,
+ size_t imgSizeH,
+ size_t imgSizeW,
+ size_t channels,
+ size_t outputH,
+ size_t outputW,
+ real* inputGradData);
+
#endif // HL_CNN_H_
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index c39bd3228d3f2ea7495cd21f5ff60bdfbbd2b51d..997eed62e07827f375c7441554b397fdd0bd6a80 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -224,4 +224,24 @@ inline void hl_maxout_backward(real* inGrad,
size_t featLen,
size_t group) {}
+inline void hl_upsample_forward(real* inputData,
+ real* maskData,
+ size_t batchSize,
+ size_t imgSizeH,
+ size_t imgSizeW,
+ size_t channels,
+ size_t outputH,
+ size_t outputW,
+ real* outputData) {}
+
+inline void hl_upsample_backward(real* outputGradData,
+ real* maskData,
+ size_t batchSize,
+ size_t imgSizeH,
+ size_t imgSizeW,
+ size_t channels,
+ size_t outputH,
+ size_t outputW,
+ real* inputGradData) {}
+
#endif // HL_CNN_STUB_H_
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index a4459243e8a7c8be58be2255faf89e29817fbdf5..bac743a293cc97b114281e510d06367a86536452 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -1028,3 +1028,79 @@ void hl_maxout_backward(real* inGrad,
num_kernels, inGrad, outGrad, idData, size, featLen, groups);
CHECK_SYNC("hl_maxout_backward failed");
}
+
+__global__ void upsampleForwardCompute(real* input_data,
+ real* mask_data,
+ size_t nthreads,
+ size_t in_h,
+ size_t in_w,
+ size_t out_h,
+ size_t out_w,
+ real* output_data) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < nthreads) {
+ int offset = index / (in_w * in_h) * out_h * out_w;
+ int upsample_idx = static_cast<int>(mask_data[index]);
+ output_data[offset + upsample_idx] = input_data[index];
+ }
+}
+
+__global__ void upsampleBackwardCompute(real* out_grad,
+ real* mask_data,
+ size_t nthreads,
+ size_t in_h,
+ size_t in_w,
+ size_t out_h,
+ size_t out_w,
+ real* input_grad) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < nthreads) {
+ int offset = index / (in_w * in_h) * out_h * out_w;
+ int upsample_idx = static_cast<int>(mask_data[index]);
+ input_grad[index] = out_grad[offset + upsample_idx];
+ }
+}
+
+void hl_upsample_forward(real* inputData,
+ real* maskData,
+ size_t batchSize,
+ size_t imgSizeH,
+ size_t imgSizeW,
+ size_t channels,
+ size_t outputH,
+ size_t outputW,
+ real* outputData) {
+ int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
+ int blocks = (num_kernels + 1024 - 1) / 1024;
+ upsampleForwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inputData,
+ maskData,
+ num_kernels,
+ imgSizeH,
+ imgSizeW,
+ outputH,
+ outputW,
+ outputData);
+ CHECK_SYNC("hl_upsample_forward failed");
+}
+
+void hl_upsample_backward(real* outputGradData,
+ real* maskData,
+ size_t batchSize,
+ size_t imgSizeH,
+ size_t imgSizeW,
+ size_t channels,
+ size_t outputH,
+ size_t outputW,
+ real* inputGradData) {
+ int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
+ int blocks = (num_kernels + 1024 - 1) / 1024;
+ upsampleBackwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(outputGradData,
+ maskData,
+ num_kernels,
+ imgSizeH,
+ imgSizeW,
+ outputH,
+ outputW,
+ inputGradData);
+ CHECK_SYNC("hl_upsample_backward failed");
+}
diff --git a/paddle/fluid/framework/.clang-format b/paddle/fluid/.clang-format
similarity index 100%
rename from paddle/fluid/framework/.clang-format
rename to paddle/fluid/.clang-format
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index a34e22ff8765fccbd5ac3a284b7c6820f0055ec3..3840bbe83b68dc2a49aa73feb57a80e9992cad5f 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -7,9 +7,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
if(WITH_GPU)
- nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+ nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place memory device_context framework_proto)
else()
- cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
+ cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place memory device_context framework_proto)
endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@@ -21,9 +21,9 @@ endif()
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
-nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
+cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
@@ -74,8 +74,8 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
add_custom_command(TARGET framework_py_proto POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto
- COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+ COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@@ -104,7 +104,7 @@ cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-# cc_test(channel_test SRCS channel_test.cc)
+cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 468423e0e8e7b8c9ebc14b7568c9c3bd21645ea7..873969b2a884f6d9e133fe87bf72725c36ce8b98 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
#include <deque>
#include <memory>
#include <set>
+#include <string>
#include <unordered_map>
#include <vector>
@@ -96,6 +97,8 @@ class BlockDesc {
*/
void RemoveOp(size_t s, size_t e);
+ void RemoveVar(const std::string &name) { vars_.erase(name); }
+
std::vector<OpDesc *> AllOps() const;
size_t OpSize() const { return ops_.size(); }
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index 019bea600f496a6b58579ad0aa8af836cd6134a9..722bf8e8ecba0c9cbc5e3ad737dbf73148d2873c 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
-#include <stddef.h> // for size_t
-#include <condition_variable>
+#include <stddef.h> // for size_t
+#include <condition_variable> // NOLINT
#include <typeindex>
#include "paddle/fluid/platform/enforce.h"
@@ -216,7 +216,8 @@ class ChannelHolder {
template <typename T>
struct PlaceholderImpl : public Placeholder {
- PlaceholderImpl(size_t buffer_size) : type_(std::type_index(typeid(T))) {
+ explicit PlaceholderImpl(size_t buffer_size)
+ : type_(std::type_index(typeid(T))) {
channel_.reset(MakeChannel<T>(buffer_size));
}
diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h
index c47d629289af2c3d1f7c30d711d338745bf5234c..26d454534e1ae38c4f83376c0836a45781ea9101 100644
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <atomic>
-#include <condition_variable>
+#include <condition_variable> // NOLINT
#include
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"
@@ -38,7 +38,7 @@ class ChannelImpl : public paddle::framework::Channel {
virtual void Unlock();
virtual bool IsClosed();
virtual void Close();
- ChannelImpl(size_t);
+ explicit ChannelImpl(size_t);
virtual ~ChannelImpl();
virtual void AddToSendQ(const void *referrer, T *data,
@@ -60,7 +60,7 @@ class ChannelImpl : public paddle::framework::Channel {
const void *referrer; // TODO(thuan): figure out better way to do this
std::function callback;
- QueueMessage(T *item)
+ explicit QueueMessage(T *item)
: data(item), cond(std::make_shared<std::condition_variable_any>()) {}
QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
@@ -88,15 +88,15 @@ class ChannelImpl : public paddle::framework::Channel {
}
std::shared_ptr<QueueMessage> get_first_message(
- std::deque<std::shared_ptr<QueueMessage>> &queue, ChannelAction action) {
- while (!queue.empty()) {
+ std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
+ while (!queue->empty()) {
// Check whether this message was added by Select
// If this was added by Select then execute the callback
// to check if you can execute this message. The callback
// can return false if some other case was executed in Select.
// In that case just discard this QueueMessage and process next.
- std::shared_ptr<QueueMessage> m = queue.front();
- queue.pop_front();
+ std::shared_ptr<QueueMessage> m = queue->front();
+ queue->pop_front();
if (m->callback == nullptr || m->callback(action)) return m;
}
return nullptr;
@@ -138,8 +138,8 @@ void ChannelImpl::Send(T *item) {
// If channel is closed, throw exception
if (closed_) {
- lock.unlock();
send_return();
+ lock.unlock();
PADDLE_THROW("Cannot send on closed channel");
}
@@ -147,16 +147,14 @@ void ChannelImpl::Send(T *item) {
// to send to the receiver, bypassing the channel buffer if any
if (!recvq.empty()) {
std::shared_ptr<QueueMessage> m =
- get_first_message(recvq, ChannelAction::SEND);
+ get_first_message(&recvq, ChannelAction::SEND);
if (m != nullptr) {
*(m->data) = std::move(*item);
m->Notify();
- lock.unlock();
send_return();
return;
} else {
- lock.unlock();
Send(item);
send_return();
return;
@@ -169,8 +167,6 @@ void ChannelImpl::Send(T *item) {
if (buf_.size() < cap_) {
// Copy to buffer
buf_.push_back(std::move(*item));
- // Release lock and return true
- lock.unlock();
send_return();
return;
}
@@ -181,8 +177,8 @@ void ChannelImpl::Send(T *item) {
sendq.push_back(m);
m->Wait(lock);
if (m->chan_closed) {
- lock.unlock();
send_return();
+ lock.unlock();
PADDLE_THROW("Cannot send on closed channel");
}
send_return();
@@ -195,17 +191,14 @@ bool ChannelImpl::Receive(T *item) {
// If channel is closed and buffer is empty or
// channel is unbuffered
- if (closed_ && buf_.empty()) {
- lock.unlock();
- return recv_return(false);
- }
+ if (closed_ && buf_.empty()) return recv_return(false);
// If there is a sender, directly receive the value we want
// from the sender. In case of a buffered channel, read from
// buffer and move front of send queue to the buffer
if (!sendq.empty()) {
std::shared_ptr<QueueMessage> m =
- get_first_message(sendq, ChannelAction::RECEIVE);
+ get_first_message(&sendq, ChannelAction::RECEIVE);
if (buf_.size() > 0) {
// Case 1 : Channel is Buffered
// Do Data transfer from front of buffer
@@ -226,10 +219,10 @@ bool ChannelImpl::Receive(T *item) {
if (m != nullptr) {
*item = std::move(*(m->data));
m->Notify();
- } else
+ } else {
return recv_return(Receive(item));
+ }
}
- lock.unlock();
return recv_return(true);
}
@@ -238,8 +231,7 @@ bool ChannelImpl::Receive(T *item) {
// Directly read from buffer
*item = std::move(buf_.front());
buf_.pop_front();
- // Release lock and return true
- lock.unlock();
+ // return true
return recv_return(true);
}
diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc
index 1184bfdae1940286fb72d9091ae4f23ff7f84a54..542d791f6bbdf7d68a4786998ccc0233fff6473d 100644
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/channel.h"
-#include <chrono>
-#include <thread>
+#include <chrono> // NOLINT
+#include <thread> // NOLINT
#include "gtest/gtest.h"
using paddle::framework::Channel;
@@ -166,9 +166,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
- if (i < buffer_size)
+ if (i < buffer_size) {
ch->Send(&i); // should block after 10 iterations
- else {
+ } else {
bool is_exception = false;
try {
ch->Send(&i);
@@ -212,12 +212,12 @@ TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
}
void ChannelCloseUnblocksReceiversTest(Channel<size_t> *ch) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -230,7 +230,7 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
@@ -241,21 +241,21 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
void ChannelCloseUnblocksSendersTest(Channel<size_t> *ch, bool isBuffered) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
- bool send_success[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
+ bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
@@ -277,13 +277,13 @@ void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) {
if (isBuffered) {
// If ch is Buffered, at least 4 threads must be blocked.
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
@@ -294,21 +294,21 @@ void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that closing a buffered channel also unblocks
@@ -409,13 +409,13 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
// This tests that destroying a channel unblocks
// any senders waiting for channel to have write space
void ChannelDestroyUnblockSenders(Channel<size_t> *ch, bool isBuffered) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
- bool send_success[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
+ bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
@@ -438,14 +438,14 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
if (isBuffered) {
// If channel is buffered, verify that at least 4 threads are blocked
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++;
}
// At least 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
@@ -454,13 +454,13 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count number of successful sends
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
@@ -473,18 +473,18 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
}
// Join all threads
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that destroying a channel also unblocks
// any receivers waiting on the channel
void ChannelDestroyUnblockReceivers(Channel<size_t> *ch) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -498,18 +498,18 @@ void ChannelDestroyUnblockReceivers(Channel *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
@@ -679,12 +679,12 @@ TEST(ChannelHolder, TypeMismatchReceiveTest) {
}
void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -697,7 +697,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
@@ -708,21 +708,21 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
- bool send_success[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
+ bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
@@ -744,13 +744,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
if (isBuffered) {
// If ch is Buffered, at least 4 threads must be blocked.
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
@@ -761,21 +761,21 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that closing a channelholder unblocks
@@ -813,13 +813,13 @@ TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
// This tests that destroying a channelholder unblocks
// any senders waiting for channel
void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
- bool send_success[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
+ bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
@@ -841,14 +841,14 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
if (isBuffered) {
// If channel is buffered, verify that at least 4 threads are blocked
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++;
}
// At least 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
@@ -857,13 +857,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count the number of successful sends
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
@@ -876,18 +876,18 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
}
// Join all threads
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that destroying a channelholder also unblocks
// any receivers waiting on the channel
void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -901,18 +901,18 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
@@ -945,12 +945,12 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
// This tests closing a channelholder many times.
void ChannelHolderManyTimesClose(ChannelHolder *ch) {
- const int num_threads = 15;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const int kNumThreads = 15;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to send data to the channel.
- for (size_t i = 0; i < num_threads / 3; i++) {
+ for (size_t i = 0; i < kNumThreads / 3; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *ended) {
@@ -962,7 +962,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
}
// Launches threads that try to receive data from the channel.
- for (size_t i = num_threads / 3; i < 2 * num_threads / 3; i++) {
+ for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -976,7 +976,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
}
// Launches threads that try to close the channel.
- for (size_t i = 2 * num_threads / 3; i < num_threads; i++) {
+ for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -991,13 +991,13 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
EXPECT_TRUE(ch->IsClosed());
// delete the channel
delete ch;
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
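For context, the property these ChannelHolder tests pin down is that Close() (or destruction) must wake every blocked sender and receiver. Below is a standalone sketch of that wake-on-close idiom using a minimal hypothetical channel, not Paddle's implementation:

```cpp
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

// Minimal channel: Send blocks until Close; Close wakes all waiters.
class Channel {
 public:
  bool Send(int v) {
    (void)v;  // no receiver in this sketch ever consumes v
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return closed_; });  // block until closed
    return false;  // close unblocked us; the send did not succeed
  }
  void Close() {
    std::unique_lock<std::mutex> lock(mu_);
    closed_ = true;
    cv_.notify_all();  // the crucial step the tests above verify
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  bool closed_ = false;
};

int main() {
  Channel ch;
  std::thread sender([&] {
    bool ok = ch.Send(42);
    std::cout << "send ok? " << std::boolalpha << ok << "\n";  // false
  });
  std::this_thread::sleep_for(std::chrono::milliseconds(50));
  ch.Close();  // unblocks the sender
  sender.join();
  return 0;
}
```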
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index bf1a705ef50b663efa53393ead1f81fd6bcf8c48..89b5c6847f15b3f2a270fe1e7db9e590549e8982 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -16,6 +16,6 @@ else()
endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph)
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index a1b913a863cc1853ea3a786d22e6e8baa8c98a02..128a5344fbb8c64c36ade24475bd0d99bdb3e0f5 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -21,6 +21,9 @@
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif
+#include <string>
+#include <unordered_set>
+
namespace paddle {
namespace framework {
namespace details {
@@ -55,6 +58,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const {
auto graph = new SSAGraph();
SSAGraph &result = *graph;
+  std::unordered_set<std::string> og_has_been_broadcast;
result.vars_.resize(places_.size());
bool is_forwarding = true;
@@ -122,9 +126,15 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
if (!is_forwarding) {
auto var_names = op->OutputArgumentNames();
+      // Currently, we assume that once a gradient is generated, it can be
+      // broadcast, and that each gradient is broadcast only once. Other
+      // cases, e.g. adjusting the gradient according to the input after it
+      // is generated, are not considered at present.
for (auto &og : var_names) {
- if (grad_names_.count(og) != 0) { // is param grad
- // Insert NCCL AllReduce Op
+ if (grad_names_.count(og) != 0 &&
+ og_has_been_broadcast.count(og) == 0) { // is param grad
+ // Insert NCCL AllReduce Op
+ og_has_been_broadcast.insert(og);
#ifdef PADDLE_WITH_CUDA
result.ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
@@ -161,6 +171,11 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
*/
PolishGraphToSupportDataHazards(&result);
+ /*
+   * Only variables should be the leaves of the graph.
+ */
+ AddOutputToLeafOps(&result);
+
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
PrintGraphviz(*graph, sout);
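The og_has_been_broadcast guard above is a seen-set dedup: a gradient name may appear in several ops' outputs but must be all-reduced exactly once. A standalone sketch of the same logic (names and data hypothetical):

```cpp
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
  // Parameter-gradient names; outputs may mention a gradient more than once.
  std::unordered_set<std::string> grad_names = {"w@GRAD", "b@GRAD"};
  std::vector<std::string> outputs = {"w@GRAD", "tmp_0", "b@GRAD", "w@GRAD"};

  std::unordered_set<std::string> og_has_been_broadcast;
  for (const auto &og : outputs) {
    // Is a param grad AND not yet broadcast: insert exactly one all-reduce.
    if (grad_names.count(og) != 0 && og_has_been_broadcast.count(og) == 0) {
      og_has_been_broadcast.insert(og);
      std::cout << "insert all-reduce for " << og << "\n";
    }
  }
  return 0;
}
```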
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index 5ddf331cfca39a4e81a42d9ff8efd5af7bcf6829..55b5f113589e090386d287e228349f22fb94a7ab 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -76,7 +76,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
}
-std::string NCCLAllReduceOpHandle::Name() const { return "NCCL AllReduce"; }
+std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; }
} // namespace details
} // namespace framework
} // namespace paddle
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
index 045070bb6a97e90600cd24d9f43cd2a10a4bc1f5..ad14a3c5cb4625fa121cad2daed389c441e78771 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -14,6 +14,9 @@
#pragma once
+#include <string>
+#include <vector>
+
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
@@ -34,6 +37,10 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
std::string Name() const override;
+  // Delaying and buffering nccl_all_reduce ops together can significantly
+  // improve performance. Returning false here disables this feature.
+ bool IsMultiDeviceTransfer() override { return true; };
+
protected:
void RunImpl() override;
};
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 71672fd24c65ee654fb9f703ea5808c31ee8fbb0..d7a541ac4bb83625060db337446d03a1afda3ed0 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -13,6 +13,8 @@
// limitations under the License.
#pragma once
+#include <string>
+#include <vector>
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/platform/device_context.h"
@@ -53,6 +55,10 @@ class OpHandleBase {
void AddOutput(VarHandleBase *out);
+  // Returns true if the op involves data transfer across multiple devices,
+  // which would likely block other computations.
+ virtual bool IsMultiDeviceTransfer() { return false; }
+
protected:
virtual void RunImpl() = 0;
};
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 361ba6d39721eed406a30fea325b3b4508ec45d0..0a4febd22f3feefdcac99cafc2cb58269380d192 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -136,6 +136,17 @@ void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
sout << "}\n";
}
+
+void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
+ for (auto &op : graph->ops_) {
+ if (!op->outputs_.empty()) {
+ continue;
+ }
+ auto *dummy_leaf = new DummyVarHandle();
+ graph->dep_vars_.emplace(dummy_leaf);
+ op->AddOutput(dummy_leaf);
+ }
+}
} // namespace details
} // namespace framework
} // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index bf20e7164a100718c1dcfe3ef971cfff60bbbaa2..be1f0460e45402806b18835f054a7195df1374cc 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -14,13 +14,13 @@
#pragma once
+#include <memory>
+#include <string>
+
#include "paddle/fluid/framework/details/ssa_graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/platform/place.h"
-#include <memory>
-#include <string>
-
namespace paddle {
namespace framework {
namespace details {
@@ -52,6 +52,8 @@ class SSAGraphBuilder {
const std::string &each_var_name,
const platform::Place &place, size_t place_offset);
+ static void AddOutputToLeafOps(SSAGraph *graph);
+
static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
};
} // namespace details
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 3f8655147b688239509dea98925df310a46cbef8..596e5731868630cebc3cf51b2e78d4deb39a9b33 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -23,22 +23,36 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
size_t num_threads, bool use_event,
    const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places,
-    std::unique_ptr<SSAGraph> &&graph)
+    std::unique_ptr<SSAGraph> &&graph, bool allow_op_delay)
: SSAGraphExecutor(std::move(graph)),
pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr),
local_scopes_(local_scopes),
places_(places),
fetch_ctxs_(places),
- use_event_(use_event) {}
+ use_event_(use_event),
+ running_ops_(0),
+ allow_op_delay_(allow_op_delay) {}
+
+void ThreadedSSAGraphExecutor::RunDelayedOps(
+    const std::unordered_set<OpHandleBase *> &delayed_ops) {
+ for (auto op : delayed_ops) {
+ op->Run(use_event_);
+ }
+}
FeedFetchList ThreadedSSAGraphExecutor::Run(
    const std::vector<std::string> &fetch_tensors) {
  std::unordered_map<OpHandleBase *, size_t> pending_ops;
  std::unordered_set<VarHandleBase *> pending_vars;
-
  BlockingQueue<VarHandleBase *> ready_vars;
-
  std::unordered_set<OpHandleBase *> ready_ops;
+  // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
+  // streams from multiple GPUs, it's faster to buffer them and schedule
+  // together, since we currently cannot overlap computation and memcpy
+  // streams. Revisit this once such overlapping becomes available.
+  std::unordered_set<OpHandleBase *> delayed_ops;
+  std::unordered_set<OpHandleBase *> blocked_by_delayed_ops;
+  std::unordered_set<VarHandleBase *> delayed_vars;
auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
pending_vars.insert(&var);
@@ -73,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Step 2. Insert FetchOps
  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
-  std::vector<DummyVarHandle> dummy_vars;
  FeedFetchList fetch_data(fetch_tensors.size());
  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
@@ -87,13 +100,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
}
+  std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
for (size_t i = 0; i < fetch_tensors.size(); ++i) {
auto &var_name = fetch_tensors[i];
auto &vars = fetched_vars.at(var_name);
auto *op = new FetchOpHandle(&fetch_data, i, &local_scopes_);
fetch_ops.emplace_back(op);
- // FIXME: Use new device context
for (auto &p : places_) {
op->dev_ctxes_[p] = fetch_ctxs_.Get(p);
}
@@ -101,12 +114,24 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto *var : vars) {
op->AddInput(var);
}
+
+ auto *fetch_dummy = new DummyVarHandle();
+ op->AddOutput(fetch_dummy);
+ fetch_dependencies.emplace(fetch_dummy);
+ InsertPendingVar(*fetch_dummy);
InsertPendingOp(*op);
}
auto run_all_ready_ops = [&] {
for (auto *op : ready_ops) {
- RunOp(ready_vars, op);
+ if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
+ delayed_ops.insert(op);
+ delayed_vars.insert(op->outputs_.begin(), op->outputs_.end());
+ ready_vars.Extend(op->outputs_);
+ continue;
+ }
+ running_ops_++;
+ RunOp(&ready_vars, op);
}
ready_ops.clear();
};
@@ -118,13 +143,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
// Step 3. Execution
- while (!pending_vars.empty()) {
+ while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
// 1. Run All Ready ops
run_all_ready_ops();
// 2. Find ready variable
bool timeout;
- auto cur_ready_vars = ready_vars.PopAll(1000, &timeout);
+ auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
if (timeout) {
if (exception_) {
@@ -141,13 +166,29 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto &deps = pending_ops[op];
--deps;
if (deps == 0) {
- ready_ops.insert(op);
+ if (delayed_vars.find(ready_var) != delayed_vars.end()) {
+ blocked_by_delayed_ops.insert(op);
+ } else {
+ ready_ops.insert(op);
+ }
}
}
}
+ // When there are no other ops to schedule, schedule buffered delayed
+ // ops and unblock other ops.
+ if (ready_ops.empty() && !delayed_ops.empty() && running_ops_ == 0) {
+ RunDelayedOps(delayed_ops);
+ delayed_ops.clear();
+ for (auto *op : blocked_by_delayed_ops) {
+ ready_ops.insert(op);
+ }
+ blocked_by_delayed_ops.clear();
+ }
// Keep loop until all vars are ready.
}
-
+ PADDLE_ENFORCE(ready_ops.empty());
+ PADDLE_ENFORCE(delayed_ops.empty());
+ PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
++computation_count_;
auto sync_computation = [&] {
@@ -182,12 +223,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
void ThreadedSSAGraphExecutor::RunOp(
-    BlockingQueue<VarHandleBase *> &ready_var_q, details::OpHandleBase *op) {
-  auto op_run = [&ready_var_q, op, this] {
+    BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
+ auto op_run = [ready_var_q, op, this] {
try {
VLOG(10) << op->Name() << " : " << op->DebugString();
op->Run(use_event_);
- ready_var_q.Extend(op->outputs_);
+ running_ops_--;
+ ready_var_q->Extend(op->outputs_);
} catch (platform::EnforceNotMet ex) {
exception_.reset(new platform::EnforceNotMet(ex));
} catch (...) {
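The scheduling policy introduced here buffers multi-device-transfer ops (delayed_ops) and flushes them only when nothing else is ready or running, so all-reduces are batched rather than interleaved with computation. A standalone sketch of that policy with plain STL types (hypothetical Op struct, not Paddle's executor API):

```cpp
#include <functional>
#include <iostream>
#include <queue>
#include <vector>

struct Op {
  bool is_multi_device_transfer;
  std::function<void()> run;
};

int main() {
  const bool allow_op_delay = true;
  std::queue<Op> ready;
  std::vector<Op> delayed;

  ready.push({false, [] { std::cout << "compute\n"; }});
  ready.push({true, [] { std::cout << "all-reduce (buffered)\n"; }});
  ready.push({false, [] { std::cout << "compute\n"; }});

  while (!ready.empty() || !delayed.empty()) {
    // Run everything that is ready, buffering transfer ops instead.
    while (!ready.empty()) {
      Op op = ready.front();
      ready.pop();
      if (allow_op_delay && op.is_multi_device_transfer) {
        delayed.push_back(op);  // delay: schedule with its peers later
      } else {
        op.run();
      }
    }
    // Nothing else to do: flush all buffered transfer ops together,
    // mirroring the ready_ops.empty() && running_ops_ == 0 check above.
    for (auto &op : delayed) op.run();
    delayed.clear();
  }
  return 0;
}
```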
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 2ea57ac8f96bc9c2b5c98bcd25d9ce921c3683cd..79cfc26b461a39811a9a125e5aeac3492d967386 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -14,7 +14,12 @@
#pragma once
-#include <chrono>
+#include <deque>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
#include <functional>
#include "ThreadPool.h" // ThreadPool in thrird party
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
@@ -70,7 +75,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
ThreadedSSAGraphExecutor(size_t num_threads, bool use_event,
    const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places,
-    std::unique_ptr<SSAGraph> &&graph);
+    std::unique_ptr<SSAGraph> &&graph,
+ bool allow_op_delay);
// Run a SSAGraph by a thread pool
// Use topological sort algorithm
@@ -79,9 +85,11 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
~ThreadedSSAGraphExecutor() {}
private:
-  void RunOp(BlockingQueue<VarHandleBase *> &ready_var_q,
+  void RunOp(BlockingQueue<VarHandleBase *> *ready_var_q,
             details::OpHandleBase *op);
+  void RunDelayedOps(const std::unordered_set<OpHandleBase *> &delayed_ops);
+
private:
std::unique_ptr<::ThreadPool> pool_;
  std::vector<Scope *> local_scopes_;
@@ -89,6 +97,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
platform::DeviceContextPool fetch_ctxs_;
const bool use_event_;
  std::unique_ptr<platform::EnforceNotMet> exception_;
+  std::atomic<int> running_ops_;
+ bool allow_op_delay_;
size_t computation_count_{0};
size_t max_async_computation{100};
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 893cc15f6c8b34fcfc33554f8ef48ffeb00cd75c..569dda17c6e91d5658c4f8b9ba0b8c8fbd966832 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -22,7 +22,7 @@
namespace paddle {
namespace framework {
namespace details {
-struct OpHandleBase;
+class OpHandleBase;
// VarHandleBase is the var node in the dependency graph.
// A variable can only be generated by a single operator. i.e.
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 64c06687b6b905186d4efcc8441d3abef6323d53..c2ca1bbc78f3ebc6066df6b666720af0d1fbbf59 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -93,6 +93,43 @@ static void CheckTensorNANOrInf(const std::string& name,
"Tensor %s contains NAN", name);
}
+void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
+ int block_id) {
+ auto& global_block = pdesc.Block(block_id);
+
+ const Scope* ancestor_scope = scope;
+ while (ancestor_scope->parent()) {
+ ancestor_scope = ancestor_scope->parent();
+ }
+
+ if (ancestor_scope != scope) {
+ for (auto& var : global_block.AllVars()) {
+ if (var->Name() == framework::kEmptyVarName) {
+ continue;
+ }
+
+ if (var->Persistable()) {
+        auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
+ InitializeVariable(ptr, var->GetType());
+ VLOG(3) << "Create Variable " << var->Name()
+ << " global, which pointer is " << ptr;
+ } else {
+ auto* ptr = scope->Var(var->Name());
+ InitializeVariable(ptr, var->GetType());
+ VLOG(3) << "Create Variable " << var->Name()
+ << " locally, which pointer is " << ptr;
+ }
+ }
+ } else {
+ for (auto& var : global_block.AllVars()) {
+ auto* ptr = scope->Var(var->Name());
+ InitializeVariable(ptr, var->GetType());
+ VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
+ << ptr;
+ }
+ }
+}
+
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) {
platform::RecordBlock b(block_id);
@@ -184,8 +221,8 @@ static bool has_fetch_operators(
void Executor::Run(const ProgramDesc& program, Scope* scope,
                   std::map<std::string, const LoDTensor*>& feed_targets,
                   std::map<std::string, LoDTensor*>& fetch_targets,
- const std::string& feed_holder_name,
- const std::string& fetch_holder_name, bool create_vars) {
+ bool create_vars, const std::string& feed_holder_name,
+ const std::string& fetch_holder_name) {
platform::RecordBlock b(kProgramId);
bool has_feed_ops =
has_feed_operators(program.Block(0), feed_targets, feed_holder_name);
@@ -279,40 +316,30 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
  return std::unique_ptr<ExecutorPrepareContext>(ctx);
}
+std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
+    const ProgramDesc& program, const std::vector<int>& block_ids) {
+  std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
+  for (auto& bid : block_ids) {
+    auto* ctx = new ExecutorPrepareContext(program, bid);
+    PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
+    auto& block = program.Block(bid);
+    for (auto& op_desc : block.AllOps()) {
+      ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
+    }
+    result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
+ }
+ return result;
+}
+
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) {
- auto& block = ctx->prog_.Block(ctx->block_id_);
-
Scope* local_scope = scope;
if (create_vars) {
if (create_local_scope) {
local_scope = &scope->NewScope();
- for (auto& var : block.AllVars()) {
- if (var->Name() == framework::kEmptyVarName) {
- continue;
- }
-
- if (var->Persistable()) {
- auto* ptr = scope->Var(var->Name());
- InitializeVariable(ptr, var->GetType());
- VLOG(3) << "Create Variable " << var->Name()
- << " global, which pointer is " << ptr;
- } else {
- auto* ptr = local_scope->Var(var->Name());
- InitializeVariable(ptr, var->GetType());
- VLOG(3) << "Create Variable " << var->Name()
- << " locally, which pointer is " << ptr;
- }
- }
- } else {
- for (auto& var : block.AllVars()) {
- auto* ptr = local_scope->Var(var->Name());
- InitializeVariable(ptr, var->GetType());
- VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
- << ptr;
- }
- } // if (create_local_scope)
- } // if (create_vars)
+ }
+ CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
+ }
for (auto& op : ctx->ops_) {
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
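CreateVariables now walks to the root (ancestor) scope and creates persistable variables there, while non-persistable ones stay in the local scope. A minimal standalone model of that walk, with a hypothetical two-field Scope:

```cpp
#include <iostream>
#include <string>

struct Scope {
  const Scope *parent = nullptr;
  std::string name;
};

// Mirror of CreateVariables' ancestor loop: climb until there is no parent.
const Scope *Ancestor(const Scope *scope) {
  while (scope->parent) scope = scope->parent;
  return scope;
}

int main() {
  Scope root{nullptr, "root"};
  Scope local{&root, "local"};
  // Persistable vars go to the ancestor; temporaries stay local.
  std::cout << "persistable -> " << Ancestor(&local)->name << "\n";
  std::cout << "temporary   -> " << local.name << "\n";
  return 0;
}
```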
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 7173c51c95e04ad3095f01bb24923a7a3341c517..75b29b2f4065ad75b62a134b890b8f9f6730fdc7 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -54,13 +54,18 @@ class Executor {
void Run(const ProgramDesc& program, Scope* scope,
           std::map<std::string, const LoDTensor*>& feed_targets,
           std::map<std::string, LoDTensor*>& fetch_targets,
+ bool create_vars = true,
const std::string& feed_holder_name = "feed",
- const std::string& fetch_holder_name = "fetch",
- bool create_vars = true);
+ const std::string& fetch_holder_name = "fetch");
  static std::unique_ptr<ExecutorPrepareContext> Prepare(
      const ProgramDesc& program, int block_id);
+  static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
+      const ProgramDesc& program, const std::vector<int>& block_ids);
+
+ void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
+
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true,
bool create_vars = true);
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index dee505fee0dccd8d60bb290a8bec4df243e504a2..4f130d265900483ec7a7c541f2610d17a352913f 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -142,6 +142,7 @@ class LoDTensor : public Tensor {
return (lod_)[level].size() - 1;
}
+ // Split LoDTensor and copy to each place specified in places.
  std::vector<LoDTensor> SplitLoDTensor(
      const std::vector<platform::Place> places) const;
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index f6a43804ef2fd73c4a2c2c3b3dfbb90bff1c451b..a3b4a8c0829ae3324e933309b2eaea35fe571997 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -35,6 +35,17 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
};
+proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
+  if (var->IsType<framework::LoDTensor>()) {
+    return framework::ToDataType(var->Get<framework::LoDTensor>().type());
+  } else if (var->IsType<framework::SelectedRows>()) {
+    return framework::ToDataType(
+        var->Get<framework::SelectedRows>().value().type());
+ } else {
+ PADDLE_THROW("Var should be LoDTensor or SelectedRows");
+ }
+}
+
static DDim GetDims(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
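GetDataTypeOfVar is a two-way dispatch on the variable's payload. A standalone sketch of the same shape using std::variant (types and dtype codes hypothetical, not Paddle's Variable API):

```cpp
#include <iostream>
#include <stdexcept>
#include <variant>

struct LoDTensor { int dtype = 5; };       // e.g. a code for FP32
struct SelectedRows { LoDTensor value; };  // wraps a tensor payload

using Variable = std::variant<LoDTensor, SelectedRows>;

// Accept the two known payload types, throw otherwise.
int GetDataTypeOfVar(const Variable &var) {
  if (auto *t = std::get_if<LoDTensor>(&var)) return t->dtype;
  if (auto *sr = std::get_if<SelectedRows>(&var)) return sr->value.dtype;
  // Unreachable with this two-type variant; kept to mirror PADDLE_THROW.
  throw std::runtime_error("Var should be LoDTensor or SelectedRows");
}

int main() {
  std::cout << GetDataTypeOfVar(Variable{LoDTensor{}}) << "\n";
  std::cout << GetDataTypeOfVar(Variable{SelectedRows{}}) << "\n";
  return 0;
}
```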
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 41214b41cb68cbd7049552f39195ae5257e0d06f..b7a7c69b4c8493f945926c75797c49d327a3197e 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -61,6 +61,8 @@ inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix;
}
+proto::VarType::Type GetDataTypeOfVar(const Variable* var);
+
class OperatorBase;
class ExecutionContext;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 91f2db9354c2a00ec7e51ea4595c7cfa00da23ea..74945fb4f2f745b6ca9c48adb0c8b9e6ae1e94a4 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h"
-#include
-#include "ThreadPool.h"
+#include <string>
+#include <vector>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@@ -42,30 +43,40 @@ class ParallelExecutorPrivate {
#endif
};
+std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
+ return member_->local_scopes_;
+}
+
ParallelExecutor::ParallelExecutor(
size_t num_threads, bool use_event,
    const std::vector<platform::Place> &places,
    const std::unordered_set<std::string> &params,
-    const ProgramDesc &startup_program, const ProgramDesc &main_program,
-    const std::string &loss_var_name, Scope *scope)
+    const std::unordered_set<std::string> &bcast_vars,
+    const ProgramDesc &main_program, const std::string &loss_var_name,
+    Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay)
: member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope;
- // Step 1. RunStartupProgram and Bcast the params to devs.
- Executor exe(places[0]);
- exe.Run(startup_program, scope, 0);
+ // Step 1. Bcast the params to devs.
// Create local scopes
- for (size_t i = 0; i < member_->places_.size(); ++i) {
- member_->local_scopes_.push_back(&scope->NewScope());
+ if (local_scopes.empty()) {
+ for (size_t i = 0; i < member_->places_.size(); ++i) {
+ member_->local_scopes_.push_back(&scope->NewScope());
+ }
+ } else {
+ PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
+ for (size_t i = 0; i < member_->places_.size(); ++i) {
+ member_->local_scopes_.push_back(local_scopes[i]);
+ }
}
// Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
#endif
- if (platform::is_gpu_place(places[0]) &&
- member_->local_scopes_.size() != 1) { // Is CUDA
- BCastParamsToGPUs(startup_program);
+ if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
+ local_scopes.empty()) { // Is CUDA
+ BCastParamsToGPUs(bcast_vars);
}
// Startup Program has been run. All local scopes have correct parameters.
@@ -82,8 +93,8 @@ ParallelExecutor::ParallelExecutor(
auto graph = builder.Build(main_program);
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
- num_threads, use_event, member_->local_scopes_, places,
- std::move(graph)));
+ num_threads, use_event, member_->local_scopes_, places, std::move(graph),
+ allow_op_delay));
// Step 3. Create vars in each scope;
for (auto *scope : member_->local_scopes_) {
@@ -98,48 +109,47 @@ ParallelExecutor::ParallelExecutor(
}
void ParallelExecutor::BCastParamsToGPUs(
- const ProgramDesc &startup_program) const {
+    const std::unordered_set<std::string> &vars) const {
#ifdef PADDLE_WITH_CUDA
auto *main_scope = member_->local_scopes_[0];
- for (auto *var_desc : startup_program.Block(0).AllVars()) {
- size_t idx = var_desc->Name().find("@GRAD");
- if (idx != std::string::npos) continue;
- if (var_desc->GetType() == proto::VarType::LOD_TENSOR) {
-      auto &main_tensor =
-          main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
-
- auto &dims = main_tensor.dims();
-
- if (paddle::platform::is_gpu_place(main_tensor.place())) {
- size_t numel = main_tensor.numel();
- ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
- platform::NCCLGroupGuard guard;
- for (size_t i = 0; i < member_->places_.size(); ++i) {
- auto place = member_->places_[i];
- void *buffer;
- if (i == 0) {
-          buffer = const_cast<void *>(main_tensor.data<void>());
- } else {
- auto local_scope = member_->local_scopes_[i];
- auto *t =
-              local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
- t->Resize(dims);
- buffer = t->mutable_data(place, main_tensor.type());
- }
- auto &nccl_ctx = member_->nccl_ctxs_->at(place);
- platform::dynload::ncclBcast(buffer, numel, data_type, 0,
- nccl_ctx.comm_, nccl_ctx.stream());
- }
- } else {
- platform::CPUPlace cpu;
- for (size_t i = 1; i < member_->places_.size(); ++i) {
+ for (auto &var : vars) {
+ auto *main_var = main_scope->FindVar(var);
+    if (!main_var->IsType<LoDTensor>()) {
+ continue;
+ }
+
+    auto &main_tensor = main_var->Get<LoDTensor>();
+
+ auto &dims = main_tensor.dims();
+
+ if (paddle::platform::is_gpu_place(main_tensor.place())) {
+ size_t numel = main_tensor.numel();
+ ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
+ platform::NCCLGroupGuard guard;
+ for (size_t i = 0; i < member_->places_.size(); ++i) {
+ auto place = member_->places_[i];
+ void *buffer;
+ if (i == 0) {
+          buffer = const_cast<void *>(main_tensor.data<void>());
+ } else {
auto local_scope = member_->local_scopes_[i];
-          auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
+          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
t->Resize(dims);
- t->mutable_data(cpu, main_tensor.type());
- paddle::framework::TensorCopy(main_tensor, cpu, t);
+ buffer = t->mutable_data(place, main_tensor.type());
}
+ auto &nccl_ctx = member_->nccl_ctxs_->at(place);
+ platform::dynload::ncclBcast(buffer, numel, data_type, 0,
+ nccl_ctx.comm_, nccl_ctx.stream());
+ }
+ } else {
+ platform::CPUPlace cpu;
+ for (size_t i = 1; i < member_->places_.size(); ++i) {
+ auto local_scope = member_->local_scopes_[i];
+      auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
+ t->Resize(dims);
+ t->mutable_data(cpu, main_tensor.type());
+ paddle::framework::TensorCopy(main_tensor, cpu, t);
}
}
member_->nccl_ctxs_->WaitAll();
@@ -149,12 +159,30 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif
}
-void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
-                           const std::string &fetched_var_name) {
+void ParallelExecutor::Run(
+    const std::vector<std::string> &fetch_tensors,
+    const std::string &fetched_var_name,
+    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
+ platform::RecordBlock b(0);
+ SplitTensorToPlaces(feed_tensors);
auto fetch_data = member_->executor_->Run(fetch_tensors);
  *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
fetch_data;
}
+void ParallelExecutor::SplitTensorToPlaces(
+    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
+ for (auto it : feed_tensors) {
+ auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
+ for (size_t j = 0; j < member_->places_.size(); ++j) {
+ // TODO(panxy0718): Do I need to delete this var?
+ member_->local_scopes_[j]
+ ->Var(it.first)
+          ->GetMutable<LoDTensor>()
+ ->ShareDataWith(lod_tensors[j]);
+ }
+ }
+}
+
} // namespace framework
} // namespace paddle
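SplitTensorToPlaces relies on SplitLoDTensor's contract: one shard per place, with each local scope's variable sharing its shard's data. A standalone sketch of the even-split case (plain vectors, hypothetical sizes, no LoD handling):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> batch = {1, 2, 3, 4, 5, 6, 7, 8};
  const std::size_t num_places = 2;

  // Even split: each place gets a contiguous slice of the fed batch.
  std::vector<std::vector<int>> shards(num_places);
  const std::size_t per_place = batch.size() / num_places;
  for (std::size_t p = 0; p < num_places; ++p) {
    shards[p].assign(batch.begin() + p * per_place,
                     batch.begin() + (p + 1) * per_place);
  }

  for (std::size_t p = 0; p < num_places; ++p) {
    // Each device then trains on its own shard.
    std::cout << "place " << p << " gets " << shards[p].size() << " rows\n";
  }
  return 0;
}
```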
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 503efa2e447b0ac70f6302aa0a89cc55e5afcb81..c048c3865f14822be4a0015e385ea1b8e05d0ced 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,8 +14,9 @@ limitations under the License. */
#pragma once
-#include <future>
+#include <string>
#include <unordered_set>
+#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
@@ -35,17 +36,25 @@ class ParallelExecutor {
explicit ParallelExecutor(size_t num_threads, bool use_event,
                            const std::vector<platform::Place>& places,
                            const std::unordered_set<std::string>& params,
-                            const ProgramDesc& startup_program,
+                            const std::unordered_set<std::string>& bcast_vars,
                            const ProgramDesc& main_program,
-                            const std::string& loss_var_name, Scope* scope);
+                            const std::string& loss_var_name, Scope* scope,
+                            const std::vector<Scope*>& local_scopes,
+ bool allow_op_delay);
+
+  std::vector<Scope*>& GetLocalScopes();
  void Run(const std::vector<std::string>& fetch_tensors,
-           const std::string& fetched_var_name = "fetched_var");
+           const std::string& fetched_var_name,
+           const std::unordered_map<std::string, LoDTensor>& feed_tensors);
private:
+ void SplitTensorToPlaces(
+      const std::unordered_map<std::string, LoDTensor>& feed_tensors);
+
ParallelExecutorPrivate* member_;
- void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
+  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
};
} // namespace framework
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 17e38b1cf042657834b4d0d1c12cbbb92f19fa45..194df3e4a8b50700e2be01ce5ebca83b92501fb8 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h"
#include <memory>  // for unique_ptr
-#include <mutex>  // for call_once
#include <set>
#include "glog/logging.h"
#include "paddle/fluid/framework/threadpool.h"
@@ -39,6 +38,7 @@ Scope::~Scope() {
}
Scope& Scope::NewScope() const {
+ std::unique_lock lock(mutex_);
kids_.push_back(new Scope(this));
return *kids_.back();
}
@@ -92,6 +92,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
}
void Scope::DeleteScope(Scope* scope) {
+ std::unique_lock lock(mutex_);
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it);
@@ -103,7 +104,7 @@ void Scope::DeleteScope(Scope* scope) {
}
}
-void Scope::EraseVars(std::vector<std::string>& var_names) {
+void Scope::EraseVars(const std::vector<std::string>& var_names) {
  std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index c1e1f49caaa5a60df0e97289aada465b45213971..c8cb70549f1d131b66fa7c6eeb35f3b7151a9e7f 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <list>
+#include <mutex>  // NOLINT
#include <string>
#include <unordered_map>
#include <vector>
@@ -51,13 +52,13 @@ class Scope {
/// Create a variable with a scope-unique name.
Variable* Var(std::string* name = nullptr);
-  void EraseVars(std::vector<std::string>& var_names);
+  void EraseVars(const std::vector<std::string>& var_names);
/// Find a variable in the scope or any of its ancestors. Returns
/// nullptr if cannot find.
Variable* FindVar(const std::string& name) const;
- const Scope& parent() const { return *parent_; }
+ const Scope* parent() const { return parent_; }
/// Find the scope or an ancestor scope that contains the given variable.
const Scope* FindScope(const Variable* var) const;
@@ -88,6 +89,9 @@ class Scope {
Scope const* parent_{nullptr};
DISABLE_COPY_AND_ASSIGN(Scope);
+
+ private:
+ mutable std::mutex mutex_;
};
} // namespace framework
} // namespace paddle
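The new mutex makes NewScope/DeleteScope safe to call from the executor's worker threads. A standalone sketch of the guarded kids_ list (a minimal hypothetical Scope, not Paddle's full class):

```cpp
#include <cstddef>
#include <iostream>
#include <list>
#include <mutex>
#include <thread>

class Scope {
 public:
  Scope &NewScope() {
    std::unique_lock<std::mutex> lock(mutex_);  // guard kids_
    kids_.push_back(new Scope);
    return *kids_.back();
  }
  void DeleteScope(Scope *scope) {
    std::unique_lock<std::mutex> lock(mutex_);  // guard kids_
    kids_.remove(scope);
    delete scope;
  }
  std::size_t KidCount() {
    std::unique_lock<std::mutex> lock(mutex_);
    return kids_.size();
  }

 private:
  std::list<Scope *> kids_;
  mutable std::mutex mutex_;
};

int main() {
  Scope root;
  auto churn = [&root] {
    for (int i = 0; i < 1000; ++i) root.DeleteScope(&root.NewScope());
  };
  std::thread a(churn), b(churn);  // concurrent create/delete is now safe
  a.join();
  b.join();
  std::cout << "remaining kids: " << root.KidCount() << "\n";  // 0
  return 0;
}
```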
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index 504344e937dfdc362cdc22298a5f963d87011e9d..d9d6b7dd67f1c6e4bbd6a4e1a8f0843d4cb93c05 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -1,8 +1,11 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
+
http://www.apache.org/licenses/LICENSE-2.0
+
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,6 +16,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
+
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index c9c2c1bb721f2c527fa52f45cc54883f639f4ef8..8e2d9470d3954e0f66c74828a8d8292c2875a8f4 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -1,8 +1,11 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
+
http://www.apache.org/licenses/LICENSE-2.0
+
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -10,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
+
+#include <algorithm>
+
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor.h"
@@ -44,6 +50,15 @@ class SelectedRows {
  void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
+  /**
+   * Get the index of `id` in rows.
+   */
+  int64_t index(int64_t id) const {
+    auto it = std::find(rows_.begin(), rows_.end(), id);
+    PADDLE_ENFORCE(it != rows_.end(), "id should be in rows");
+    return static_cast<int64_t>(std::distance(rows_.begin(), it));
+ }
+
DDim GetCompleteDims() const {
    std::vector<int64_t> dims = vectorize(value_->dims());
dims[0] = height_;
@@ -52,7 +67,7 @@ class SelectedRows {
private:
// Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here.
- // SelectedRows are simplely concated when adding together. Until a
+  // SelectedRows are simply concatenated when added together. Until a
  // SelectedRows is added to a Tensor, duplicate rows are not handled.
  Vector<int64_t> rows_;
  std::unique_ptr<Tensor> value_{nullptr};
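SelectedRows::index is a linear search over rows_, which may contain duplicates; the first match wins. A standalone sketch of the same lookup (assumes the id is present, as the PADDLE_ENFORCE above requires):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

// Linear scan, as index() does; returns the offset of the first match.
int64_t IndexOf(const std::vector<int64_t> &rows, int64_t id) {
  auto it = std::find(rows.begin(), rows.end(), id);
  if (it == rows.end()) throw std::runtime_error("id should be in rows");
  return static_cast<int64_t>(std::distance(rows.begin(), it));
}

int main() {
  std::vector<int64_t> rows = {0, 4, 7, 0, 5};  // duplicates are allowed
  std::cout << IndexOf(rows, 7) << "\n";        // prints 2
  return 0;
}
```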
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index f7a6b5ba84ca1762bd903790aa3c0346b22ed035..6f878541e6de1deec1829145b1b325ecd176a034 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -45,11 +45,10 @@ class Tensor {
friend struct EigenVector;
public:
- Tensor() : offset_(0), is_pinned_(false) {}
+ Tensor() : offset_(0) {}
/*! Constructor with place should only be used in pybind. */
- explicit Tensor(const platform::Place& place)
- : offset_(0), is_pinned_(false) {
+ explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
@@ -70,12 +69,11 @@ class Tensor {
* @note If not exist, then allocation.
*/
  template <typename T>
- inline T* mutable_data(platform::Place place, bool is_pinned = false);
+ inline T* mutable_data(platform::Place place);
- inline void* mutable_data(platform::Place place, std::type_index type,
- bool is_pinned = false);
+ inline void* mutable_data(platform::Place place, std::type_index type);
- inline void* mutable_data(platform::Place place, bool is_pinned = false);
+ inline void* mutable_data(platform::Place place);
/**
* @brief Return a pointer to mutable memory block.
@@ -86,8 +84,7 @@ class Tensor {
* @note If not exist, then allocation.
*/
  template <typename T>
- inline T* mutable_data(DDim dims, platform::Place place,
- bool is_pinned = false);
+ inline T* mutable_data(DDim dims, platform::Place place);
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
@@ -95,9 +92,6 @@ class Tensor {
/*! Return the numel of the memory block. */
inline int64_t numel() const;
- /*! Return the numel of the memory block. */
- inline bool isPinned() const;
-
/*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims);
@@ -152,14 +146,12 @@ class Tensor {
  template <typename Place>
struct PlaceholderImpl : public Placeholder {
- PlaceholderImpl(Place place, size_t size, std::type_index type,
- bool is_pinned = false)
-        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
-               memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
+    PlaceholderImpl(Place place, size_t size, std::type_index type)
+        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
+               memory::PODDeleter<uint8_t, Place>(place)),
place_(place),
size_(size),
- type_(type),
- is_pinned_(is_pinned) {
+ type_(type) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU"));
}
@@ -182,9 +174,6 @@ class Tensor {
/* the current type of memory */
std::type_index type_;
-
- /*! use pinned memory or not. */
- bool is_pinned_;
};
/*! holds the memory block if allocated. */
@@ -219,7 +208,6 @@ class Tensor {
* PlaceHolder::ptr_ and where the tensor data really begins.
*/
size_t offset_;
- bool is_pinned_;
};
inline void Tensor::switch_place(platform::Place new_place) {
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 113814971e115fa88bd0ded34017fa26a9dd5803..f49d1a47a325b2aac6185073203df124be18b54d 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -101,21 +101,19 @@ inline T* Tensor::data() {
}
template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place,
- bool is_pinned) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
  static_assert(std::is_pod<T>::value, "T must be POD");
  Resize(dims);
-  return mutable_data<T>(place, is_pinned);
+  return mutable_data<T>(place);
}
template <typename T>
-inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline T* Tensor::mutable_data(platform::Place place) {
  static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
}
-inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
- bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
@@ -129,27 +127,33 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
- boost::get