diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 89c620bb2f7ef634fa80b64eec7037e8cb9a190c..6140340890c0e5025eb08209e8ea78df918b4dc0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,3 +1,4 @@
+repos:
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
sha: v1.0.1
hooks:
@@ -25,6 +26,14 @@
entry: bash ./.clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
+- repo: local
+ hooks:
+ - id: cpplint-cpp-source
+ name: cpplint
+ description: Check C++ code style using cpplint.py.
+ entry: bash ./tools/codestyle/cpplint_pre_commit.hook
+ language: system
+ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
diff --git a/.travis.yml b/.travis.yml
index bf6a41d13c4eabc2d8543ab821ce0ff747a061df..929c847bd36d64e79a199b2634ebf68c3225429b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -34,7 +34,7 @@ addons:
- automake
- libtool
- ccache
- ssh_known_hosts: 52.76.173.135
+ ssh_known_hosts: 13.229.163.131
before_install:
- if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
# Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1e11f86d0ee836f65e69c8398fb26c3b6a1070f6..c649aafeddaf9f28c213d086236c3779d3137d92 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,8 +53,7 @@ option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
-# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option.
-option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" OFF)
+option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
@@ -109,7 +108,7 @@ if (WITH_C_API AND WITH_PYTHON)
endif()
if (WITH_C_API)
- set(WITH_FLUID OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
+ set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
endif()
if(MOBILE_INFERENCE)
@@ -147,6 +146,7 @@ include(external/cares)
include(external/grpc)
include(external/snappy) # download snappy
include(external/snappystream)
+include(external/threadpool)
include(cudnn) # set cudnn libraries, must before configure
include(cupti)
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
index b619613ea7a5b6e940ec735314e8e47338b2c600..64816098a524f064ec12474a736cd4c721227a70 100644
--- a/benchmark/cluster/README.md
+++ b/benchmark/cluster/README.md
@@ -36,11 +36,41 @@
- Trainer Count: 100
- Metrics: mini-batch / sec
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
+
+<table>
+<thead>
+<tr>
+<th>Batch Size</th>
+<th>32</th>
+<th>64</th>
+<th>128</th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+
### Measure the Performance for Different PServer Count
@@ -48,11 +78,41 @@
- Batch Size: 64
- Metrics: mini-batch / sec
-| PServer Count | 10 | 20 | 40 | 60 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
+
+<table>
+<thead>
+<tr>
+<th>PServer Count</th>
+<th>10</th>
+<th>20</th>
+<th>40</th>
+<th>60</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+
### Measure Parallel Efficiency By Increasing Trainer Count
@@ -67,11 +127,69 @@ The parallel efficiency is:
$E = \frac{S}{N}$
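
For example, a speedup of $S = 45$ with $N = 50$ trainers gives an efficiency of $E = 0.9$.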
-| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
-| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - |
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - |
+
+<table>
+<thead>
+<tr>
+<th>Trainer Counter</th>
+<th>1</th>
+<th>10</th>
+<th>20</th>
+<th>30</th>
+<th>40</th>
+<th>50</th>
+<th>60</th>
+<th>70</th>
+<th>80</th>
+<th>90</th>
+<th>100</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+
## Reproduce the benchmark
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index cd681a1a282d9a26eac1c267bfa26967f8c3c9fd..d56a912b9b03986e32693363f82df05a34b779e9 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -16,11 +16,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Metrics: samples / sec
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
+
+<table>
+<thead>
+<tr>
+<th>Batch Size</th>
+<th>32</th>
+<th>64</th>
+<th>128</th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>15.44</td>
+<td>16.32</td>
+<td>16.74</td>
+<td>16.79</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>15.97</td>
+<td>17.04</td>
+<td>17.60</td>
+<td>17.83</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>9.09</td>
+<td>9.10</td>
+<td>9.24</td>
+<td>8.66</td>
+</tr>
+</tbody>
+</table>
+
### Different Batch Size
@@ -28,12 +58,40 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Trainer Count: 20
- Metrics: samples / sec
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
-| TensorFlow | - | - | - | - |
-
+
+<table>
+<thead>
+<tr>
+<th>Batch Size</th>
+<th>32</th>
+<th>64</th>
+<th>128</th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>190.20</td>
+<td>222.15</td>
+<td>247.40</td>
+<td>258.18</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>170.96</td>
+<td>233.71</td>
+<td>256.14</td>
+<td>329.23</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+
### Accelerate Rate
@@ -41,11 +99,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Batch Size: 128
- Metrics: samples / sec
-| Trainer Count | 20 | 40 | 80 | 100 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
-| TensorFlow | - | - | - | - |
+
+<table>
+<thead>
+<tr>
+<th>Trainer Count</th>
+<th>20</th>
+<th>40</th>
+<th>80</th>
+<th>100</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>263.29 (78.64%)</td>
+<td>518.80 (77.47%)</td>
+<td>836.26 (62.44%)</td>
+<td>1019.29 (60.89%)</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2 (need more tests)</td>
+<td>326.85 (92.85%)</td>
+<td>534.58 (75.93%)</td>
+<td>853.30 (60.60%)</td>
+<td>1041.99 (59.20%)</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+
### Different Pserver Count
@@ -53,11 +141,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Batch Size: 128
- Metrics: samples/ sec
-| PServer Count | 3 | 6 |10 | 20 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
-| TensorFlow | - | - | - | - |
+
+<table>
+<thead>
+<tr>
+<th>PServer Count</th>
+<th>3</th>
+<th>6</th>
+<th>10</th>
+<th>20</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid (should fix in next PR)</td>
+<td>589.1</td>
+<td>592.6</td>
+<td>656.4</td>
+<td>655.8</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2 (need more tests)</td>
+<td>593.4</td>
+<td>791.3</td>
+<td>729.7</td>
+<td>821.7</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+
*The performance gap between Fluid and v2 comes from the network interference.*
diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc31d098328bc237c018ebf8f158bdab5c37bff1
--- /dev/null
+++ b/benchmark/fluid/machine_translation.py
@@ -0,0 +1,349 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""seq2seq model for fluid."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import distutils.util
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+from paddle.fluid.executor import Executor
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ "--embedding_dim",
+ type=int,
+ default=512,
+ help="The dimension of embedding table. (default: %(default)d)")
+parser.add_argument(
+ "--encoder_size",
+ type=int,
+ default=512,
+ help="The size of encoder bi-rnn unit. (default: %(default)d)")
+parser.add_argument(
+ "--decoder_size",
+ type=int,
+ default=512,
+ help="The size of decoder rnn unit. (default: %(default)d)")
+parser.add_argument(
+ "--batch_size",
+ type=int,
+ default=16,
+    help="The number of sequences in a mini-batch. (default: %(default)d)")
+parser.add_argument(
+ "--dict_size",
+ type=int,
+ default=30000,
+    help="The dictionary capacity. The source and target dictionaries "
+    "have the same capacity. (default: %(default)d)")
+parser.add_argument(
+ "--pass_num",
+ type=int,
+ default=2,
+ help="The pass number to train. (default: %(default)d)")
+parser.add_argument(
+ "--learning_rate",
+ type=float,
+ default=0.0002,
+ help="Learning rate used to train the model. (default: %(default)f)")
+parser.add_argument(
+ "--infer_only", action='store_true', help="If set, run forward only.")
+parser.add_argument(
+ "--beam_size",
+ type=int,
+ default=3,
+ help="The width for beam searching. (default: %(default)d)")
+parser.add_argument(
+ "--use_gpu",
+ type=distutils.util.strtobool,
+ default=True,
+ help="Whether to use gpu. (default: %(default)d)")
+parser.add_argument(
+ "--max_length",
+ type=int,
+ default=250,
+ help="The maximum length of sequence when doing generation. "
+ "(default: %(default)d)")
+
+
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+ def linear(inputs):
+ return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
+
+ forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+ input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+ output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+ cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
+
+ cell_t = fluid.layers.sums(input=[
+ fluid.layers.elementwise_mul(
+ x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
+ x=input_gate, y=cell_tilde)
+ ])
+
+ hidden_t = fluid.layers.elementwise_mul(
+ x=output_gate, y=fluid.layers.tanh(x=cell_t))
+
+ return hidden_t, cell_t
+
+
+def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
+ target_dict_dim, is_generating, beam_size, max_length):
+ """Construct a seq2seq network."""
+
+ def bi_lstm_encoder(input_seq, gate_size):
+ # Linear transformation part for input gate, output gate, forget gate
+ # and cell activation vectors need be done outside of dynamic_lstm.
+ # So the output size is 4 times of gate_size.
+ input_forward_proj = fluid.layers.fc(input=input_seq,
+ size=gate_size * 4,
+ act=None,
+ bias_attr=False)
+ forward, _ = fluid.layers.dynamic_lstm(
+ input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
+ input_reversed_proj = fluid.layers.fc(input=input_seq,
+ size=gate_size * 4,
+ act=None,
+ bias_attr=False)
+ reversed, _ = fluid.layers.dynamic_lstm(
+ input=input_reversed_proj,
+ size=gate_size * 4,
+ is_reverse=True,
+ use_peepholes=False)
+ return forward, reversed
+
+ src_word_idx = fluid.layers.data(
+ name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+
+ src_embedding = fluid.layers.embedding(
+ input=src_word_idx,
+ size=[source_dict_dim, embedding_dim],
+ dtype='float32')
+
+ src_forward, src_reversed = bi_lstm_encoder(
+ input_seq=src_embedding, gate_size=encoder_size)
+
+ encoded_vector = fluid.layers.concat(
+ input=[src_forward, src_reversed], axis=1)
+
+ encoded_proj = fluid.layers.fc(input=encoded_vector,
+ size=decoder_size,
+ bias_attr=False)
+
+ backward_first = fluid.layers.sequence_pool(
+ input=src_reversed, pool_type='first')
+
+ decoder_boot = fluid.layers.fc(input=backward_first,
+ size=decoder_size,
+ bias_attr=False,
+ act='tanh')
+
+ def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
+ decoder_boot, decoder_size):
+ def simple_attention(encoder_vec, encoder_proj, decoder_state):
+ decoder_state_proj = fluid.layers.fc(input=decoder_state,
+ size=decoder_size,
+ bias_attr=False)
+ decoder_state_expand = fluid.layers.sequence_expand(
+ x=decoder_state_proj, y=encoder_proj)
+ concated = fluid.layers.concat(
+ input=[encoder_proj, decoder_state_expand], axis=1)
+ attention_weights = fluid.layers.fc(input=concated,
+ size=1,
+ act='tanh',
+ bias_attr=False)
+ attention_weights = fluid.layers.sequence_softmax(
+ input=attention_weights)
+            weights_reshape = fluid.layers.reshape(
+                x=attention_weights, shape=[-1])
+            scaled = fluid.layers.elementwise_mul(
+                x=encoder_vec, y=weights_reshape, axis=0)
+ context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
+ return context
+
+ rnn = fluid.layers.DynamicRNN()
+
+ cell_init = fluid.layers.fill_constant_batch_size_like(
+ input=decoder_boot,
+ value=0.0,
+ shape=[-1, decoder_size],
+ dtype='float32')
+ cell_init.stop_gradient = False
+
+ with rnn.block():
+ current_word = rnn.step_input(target_embedding)
+ encoder_vec = rnn.static_input(encoder_vec)
+ encoder_proj = rnn.static_input(encoder_proj)
+ hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+ cell_mem = rnn.memory(init=cell_init)
+ context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
+ decoder_inputs = fluid.layers.concat(
+ input=[context, current_word], axis=1)
+ h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
+ rnn.update_memory(hidden_mem, h)
+ rnn.update_memory(cell_mem, c)
+ out = fluid.layers.fc(input=h,
+ size=target_dict_dim,
+ bias_attr=True,
+ act='softmax')
+ rnn.output(out)
+ return rnn()
+
+ if not is_generating:
+ trg_word_idx = fluid.layers.data(
+ name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+
+ trg_embedding = fluid.layers.embedding(
+ input=trg_word_idx,
+ size=[target_dict_dim, embedding_dim],
+ dtype='float32')
+
+ prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
+ encoded_proj, decoder_boot,
+ decoder_size)
+ label = fluid.layers.data(
+ name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+ cost = fluid.layers.cross_entropy(input=prediction, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
+
+ return avg_cost, feeding_list
+
+
+def to_lodtensor(data, place):
+ seq_lens = [len(seq) for seq in data]
+ cur_len = 0
+ lod = [cur_len]
+ for l in seq_lens:
+ cur_len += l
+ lod.append(cur_len)
+ flattened_data = np.concatenate(data, axis=0).astype("int64")
+ flattened_data = flattened_data.reshape([len(flattened_data), 1])
+ lod_t = core.LoDTensor()
+ lod_t.set(flattened_data, place)
+ lod_t.set_lod([lod])
+ return lod_t, lod[-1]
+
+
+def lodtensor_to_ndarray(lod_tensor):
+ dims = lod_tensor.get_dims()
+ ndarray = np.zeros(shape=dims).astype('float32')
+ for i in xrange(np.product(dims)):
+ ndarray.ravel()[i] = lod_tensor.get_float_element(i)
+ return ndarray
+
+
+def train():
+ avg_cost, feeding_list = seq_to_seq_net(
+ args.embedding_dim,
+ args.encoder_size,
+ args.decoder_size,
+ args.dict_size,
+ args.dict_size,
+ False,
+ beam_size=args.beam_size,
+ max_length=args.max_length)
+
+ # clone from default main program
+ inference_program = fluid.default_main_program().clone()
+
+ optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+ optimizer.minimize(avg_cost)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ train_batch_generator = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+ batch_size=args.batch_size)
+
+ test_batch_generator = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+ batch_size=args.batch_size)
+
+ place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+ exe = Executor(place)
+ exe.run(framework.default_startup_program())
+
+ def do_validation():
+ total_loss = 0.0
+ count = 0
+ for batch_id, data in enumerate(test_batch_generator()):
+ src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0]
+ trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0]
+ lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0]
+
+ fetch_outs = exe.run(inference_program,
+ feed={
+ feeding_list[0]: src_seq,
+ feeding_list[1]: trg_seq,
+ feeding_list[2]: lbl_seq
+ },
+ fetch_list=[avg_cost],
+ return_numpy=False)
+
+ total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
+ count += 1
+
+ return total_loss / count
+
+ for pass_id in xrange(args.pass_num):
+ pass_start_time = time.time()
+ words_seen = 0
+ for batch_id, data in enumerate(train_batch_generator()):
+ src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
+ words_seen += word_num
+ trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
+ words_seen += word_num
+ lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
+
+ fetch_outs = exe.run(framework.default_main_program(),
+ feed={
+ feeding_list[0]: src_seq,
+ feeding_list[1]: trg_seq,
+ feeding_list[2]: lbl_seq
+ },
+ fetch_list=[avg_cost])
+
+ avg_cost_val = np.array(fetch_outs[0])
+ print('pass_id=%d, batch_id=%d, train_loss: %f' %
+ (pass_id, batch_id, avg_cost_val))
+
+ pass_end_time = time.time()
+ test_loss = do_validation()
+ time_consumed = pass_end_time - pass_start_time
+ words_per_sec = words_seen / time_consumed
+ print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
+ (pass_id, test_loss, words_per_sec, time_consumed))
+
+
+def infer():
+ pass
+
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ if args.infer_only:
+ infer()
+ else:
+ train()
diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f7afaeb11447d936b65a1d83701b0176ecbc111
--- /dev/null
+++ b/benchmark/fluid/mnist.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import cProfile
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+
+SEED = 1
+DTYPE = "float32"
+
+# The random seed must be set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("mnist model benchmark.")
+ parser.add_argument(
+ '--batch_size', type=int, default=128, help='The minibatch size.')
+ parser.add_argument(
+ '--iterations', type=int, default=35, help='The number of minibatches.')
+ parser.add_argument(
+ '--pass_num', type=int, default=5, help='The number of passes.')
+ parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help='The device type.')
+ parser.add_argument(
+ '--infer_only', action='store_true', help='If set, run forward only.')
+ parser.add_argument(
+ '--use_cprof', action='store_true', help='If set, use cProfile.')
+ parser.add_argument(
+ '--use_nvprof',
+ action='store_true',
+ help='If set, use nvprof for CUDA.')
+ args = parser.parse_args()
+ return args
+
+
+def print_arguments(args):
+ vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+ vars(args)['device'] == 'GPU')
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+def cnn_model(data):
+ conv_pool_1 = fluid.nets.simple_img_conv_pool(
+ input=data,
+ filter_size=5,
+ num_filters=20,
+ pool_size=2,
+ pool_stride=2,
+ act="relu")
+ conv_pool_2 = fluid.nets.simple_img_conv_pool(
+ input=conv_pool_1,
+ filter_size=5,
+ num_filters=50,
+ pool_size=2,
+ pool_stride=2,
+ act="relu")
+
+    # TODO(dzhwinter) : refine the initializer and random seed setting
+ SIZE = 10
+ input_shape = conv_pool_2.shape
+ param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+ scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+ predict = fluid.layers.fc(
+ input=conv_pool_2,
+ size=SIZE,
+ act="softmax",
+ param_attr=fluid.param_attr.ParamAttr(
+ initializer=fluid.initializer.NormalInitializer(
+ loc=0.0, scale=scale)))
+ return predict
+
+
+def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
+ test_reader = paddle.batch(
+ paddle.dataset.mnist.test(), batch_size=args.batch_size)
+ test_pass_acc = fluid.average.WeightedAverage()
+ for batch_id, data in enumerate(test_reader()):
+ img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
+ data)).astype(DTYPE)
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([len(y_data), 1])
+
+ acc, weight = exe.run(inference_program,
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[batch_acc, batch_size_tensor])
+ test_pass_acc.add(value=acc, weight=weight)
+ pass_acc = test_pass_acc.eval()
+ return pass_acc
+
+
+def run_benchmark(model, args):
+ if args.use_cprof:
+ pr = cProfile.Profile()
+ pr.enable()
+ start_time = time.time()
+ # Input data
+ images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+ # Train program
+ predict = model(images)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ # Evaluator
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(
+ input=predict, label=label, total=batch_size_tensor)
+
+ # inference program
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ # Optimization
+ opt = fluid.optimizer.AdamOptimizer(
+ learning_rate=0.001, beta1=0.9, beta2=0.999)
+ opt.minimize(avg_cost)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ # Initialize executor
+ place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+ exe = fluid.Executor(place)
+
+ # Parameter initialization
+ exe.run(fluid.default_startup_program())
+
+ # Reader
+ train_reader = paddle.batch(
+ paddle.dataset.mnist.train(), batch_size=args.batch_size)
+
+ accuracy = fluid.average.WeightedAverage()
+ for pass_id in range(args.pass_num):
+ accuracy.reset()
+ pass_start = time.time()
+ for batch_id, data in enumerate(train_reader()):
+ img_data = np.array(
+ map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([len(y_data), 1])
+
+ start = time.time()
+ outs = exe.run(
+ fluid.default_main_program(),
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[avg_cost, batch_acc, batch_size_tensor]
+            )  # The reported accuracy is accumulated over batches, not just the current batch.
+ accuracy.add(value=outs[1], weight=outs[2])
+ end = time.time()
+ loss = np.array(outs[0])
+ acc = np.array(outs[1])
+ print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
+                  (pass_id, batch_id, loss, 1 - acc, end - start))
+
+ pass_end = time.time()
+
+ train_avg_acc = accuracy.eval()
+ test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+ inference_program)
+
+ print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
+ (pass_id, train_avg_acc, test_avg_acc,
+               pass_end - pass_start))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ print_arguments(args)
+ if args.use_nvprof and args.device == 'GPU':
+ with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+ run_benchmark(cnn_model, args)
+ else:
+ run_benchmark(cnn_model, args)
diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0f1db979fa7fb640679beacafd66dfbe1f62ab8
--- /dev/null
+++ b/benchmark/fluid/resnet.py
@@ -0,0 +1,323 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import functools
+import numpy as np
+import time
+
+import cProfile, pstats, StringIO
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+
+
+def parse_args():
+ parser = argparse.ArgumentParser('Convolution model benchmark.')
+ parser.add_argument(
+ '--model',
+ type=str,
+ choices=['resnet_imagenet', 'resnet_cifar10'],
+ default='resnet_imagenet',
+ help='The model architecture.')
+ parser.add_argument(
+ '--batch_size', type=int, default=32, help='The minibatch size.')
+ parser.add_argument(
+ '--use_fake_data',
+ action='store_true',
+        help='If set, use fake data instead of reading real data.')
+ parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=5,
+        help='The number of initial minibatches to skip, for a more accurate performance test'
+ )
+ parser.add_argument(
+ '--iterations', type=int, default=80, help='The number of minibatches.')
+ parser.add_argument(
+ '--pass_num', type=int, default=100, help='The number of passes.')
+ parser.add_argument(
+ '--data_format',
+ type=str,
+ default='NCHW',
+ choices=['NCHW', 'NHWC'],
+        help='The data format; only NCHW is currently supported.')
+ parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help='The device type.')
+ parser.add_argument(
+ '--data_set',
+ type=str,
+ default='flowers',
+ choices=['cifar10', 'flowers'],
+ help='Optional dataset for benchmark.')
+ parser.add_argument(
+ '--infer_only', action='store_true', help='If set, run forward only.')
+ parser.add_argument(
+ '--use_cprof', action='store_true', help='If set, use cProfile.')
+ parser.add_argument(
+ '--use_nvprof',
+ action='store_true',
+ help='If set, use nvprof for CUDA.')
+ parser.add_argument(
+ '--with_test',
+ action='store_true',
+ help='If set, test the testset during training.')
+ args = parser.parse_args()
+ return args
+
+
+def print_arguments(args):
+ vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+ vars(args)['device'] == 'GPU')
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+ conv1 = fluid.layers.conv2d(
+ input=input,
+ filter_size=filter_size,
+ num_filters=ch_out,
+ stride=stride,
+ padding=padding,
+ act=None,
+ bias_attr=False)
+ return fluid.layers.batch_norm(input=conv1, act=act)
+
+
+def shortcut(input, ch_out, stride):
+ ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1]
+ if ch_in != ch_out:
+ return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+ else:
+ return input
+
+
+def basicblock(input, ch_out, stride):
+ short = shortcut(input, ch_out, stride)
+ conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
+ conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+ return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def bottleneck(input, ch_out, stride):
+ short = shortcut(input, ch_out * 4, stride)
+ conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
+ conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
+ conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+ return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):
+ res_out = block_func(input, ch_out, stride)
+ for i in range(1, count):
+ res_out = block_func(res_out, ch_out, 1)
+ return res_out
+
+
+def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+
+ cfg = {
+        18: ([2, 2, 2, 2], basicblock),
+ 34: ([3, 4, 6, 3], basicblock),
+ 50: ([3, 4, 6, 3], bottleneck),
+ 101: ([3, 4, 23, 3], bottleneck),
+ 152: ([3, 8, 36, 3], bottleneck)
+ }
+ stages, block_func = cfg[depth]
+ conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+ pool1 = fluid.layers.pool2d(
+ input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+ res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+ res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+ res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+ res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+ pool2 = fluid.layers.pool2d(
+ input=res4,
+ pool_size=7,
+ pool_type='avg',
+ pool_stride=1,
+ global_pooling=True)
+ out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
+ return out
+
+
+def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
+ assert (depth - 2) % 6 == 0
+
+ n = (depth - 2) // 6
+
+ conv1 = conv_bn_layer(
+ input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+ res1 = layer_warp(basicblock, conv1, 16, n, 1)
+ res2 = layer_warp(basicblock, res1, 32, n, 2)
+ res3 = layer_warp(basicblock, res2, 64, n, 2)
+ pool = fluid.layers.pool2d(
+ input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+ out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
+ return out
+
+
+def run_benchmark(model, args):
+ if args.use_cprof:
+ pr = cProfile.Profile()
+ pr.enable()
+
+ if args.data_set == "cifar10":
+ class_dim = 10
+ if args.data_format == 'NCHW':
+ dshape = [3, 32, 32]
+ else:
+ dshape = [32, 32, 3]
+ else:
+ class_dim = 102
+ if args.data_format == 'NCHW':
+ dshape = [3, 224, 224]
+ else:
+ dshape = [224, 224, 3]
+
+ input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ predict = model(input, class_dim)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(
+ input=predict, label=label, total=batch_size_tensor)
+
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+ opts = optimizer.minimize(avg_cost)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+
+ def test(exe):
+ test_accuracy = fluid.average.WeightedAverage()
+ for batch_id, data in enumerate(test_reader()):
+ img_data = np.array(map(lambda x: x[0].reshape(dshape),
+ data)).astype("float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ acc, weight = exe.run(inference_program,
+ feed={"data": img_data,
+ "label": y_data},
+ fetch_list=[batch_acc, batch_size_tensor])
+ test_accuracy.add(value=acc, weight=weight)
+
+ return test_accuracy.eval()
+
+ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ accuracy = fluid.average.WeightedAverage()
+ if args.use_fake_data:
+ data = train_reader().next()
+ image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
+ 'float32')
+ label = np.array(map(lambda x: x[1], data)).astype('int64')
+ label = label.reshape([-1, 1])
+
+ iters, num_samples, start_time = 0, 0, time.time()
+ for pass_id in range(args.pass_num):
+ accuracy.reset()
+ train_accs = []
+ train_losses = []
+ for batch_id, data in enumerate(train_reader()):
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+ if iters == args.iterations:
+ break
+ if not args.use_fake_data:
+ image = np.array(map(lambda x: x[0].reshape(dshape),
+ data)).astype('float32')
+ label = np.array(map(lambda x: x[1], data)).astype('int64')
+ label = label.reshape([-1, 1])
+ loss, acc, weight = exe.run(
+ fluid.default_main_program(),
+ feed={'data': image,
+ 'label': label},
+ fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+ iters += 1
+            num_samples += len(data)
+ accuracy.add(value=acc, weight=weight)
+ train_losses.append(loss)
+ train_accs.append(acc)
+ print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
+ (pass_id, iters, loss, acc))
+ pass_train_acc = accuracy.eval()
+ # evaluation
+ if args.with_test:
+ pass_test_acc = test(exe)
+ train_elapsed = time.time() - start_time
+        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
+ (pass_id, np.mean(train_losses), np.mean(train_accs)))
+
+ examples_per_sec = num_samples / train_elapsed
+
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+ (num_samples, train_elapsed, examples_per_sec))
+
+ if args.use_cprof:
+ pr.disable()
+ s = StringIO.StringIO()
+ sortby = 'cumulative'
+ ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
+ ps.print_stats()
+ print(s.getvalue())
+
+
+if __name__ == '__main__':
+ model_map = {
+ 'resnet_imagenet': resnet_imagenet,
+ 'resnet_cifar10': resnet_cifar10
+ }
+ args = parse_args()
+ print_arguments(args)
+ if args.data_format == 'NHWC':
+ raise ValueError('Only support NCHW data_format now.')
+ if args.use_nvprof and args.device == 'GPU':
+ with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+ run_benchmark(model_map[args.model], args)
+ else:
+ run_benchmark(model_map[args.model], args)
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..663e2efd5392a6cd1a71f51fa0d017070b489341
--- /dev/null
+++ b/benchmark/fluid/run.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# This script benchmarks PaddlePaddle Fluid on a
+# single thread and a single GPU.
+export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
+
+# disable openmp and mkl parallel
+#https://github.com/PaddlePaddle/Paddle/issues/7199
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+if [ $ht -eq 1 ]; then # HT is OFF
+ if [ -z "$KMP_AFFINITY" ]; then
+ export KMP_AFFINITY="granularity=fine,compact,0,0"
+ fi
+ if [ -z "$OMP_DYNAMIC" ]; then
+ export OMP_DYNAMIC="FALSE"
+ fi
+else # HT is ON
+ if [ -z "$KMP_AFFINITY" ]; then
+ export KMP_AFFINITY="granularity=fine,compact,1,0"
+ fi
+fi
+# disable multi-GPU if more than one is available
+export CUDA_VISIBLE_DEVICES=0
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
+
+
+# vgg16
+# cifar10 gpu cifar10 128
+FLAGS_benchmark=true python fluid/vgg.py \
+ --device=GPU \
+ --batch_size=128 \
+ --skip_batch_num=5 \
+ --iterations=30 \
+  > vgg16_gpu_128.log 2>&1
+
+# resnet50
+# resnet50 gpu cifar10 128
+FLAGS_benchmark=true python fluid/resnet.py \
+ --device=GPU \
+ --batch_size=128 \
+ --data_set=cifar10 \
+ --model=resnet_cifar10 \
+ --skip_batch_num=5 \
+ --iterations=30 \
+  > resnet50_gpu_128.log 2>&1
+
+# lstm
diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e063549e0239abf9d946ed8735f0306203509d0
--- /dev/null
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import cPickle
+import os
+import random
+import time
+
+import numpy
+import paddle.v2 as paddle
+import paddle.v2.dataset.imdb as imdb
+import paddle.fluid as fluid
+from paddle.v2 import batch
+import paddle.fluid.profiler as profiler
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=32,
+        help='The number of sequences in a batch. (default: %(default)d)')
+ parser.add_argument(
+ '--emb_dim',
+ type=int,
+ default=512,
+ help='Dimension of embedding table. (default: %(default)d)')
+ parser.add_argument(
+ '--hidden_dim',
+ type=int,
+ default=512,
+ help='Hidden size of lstm unit. (default: %(default)d)')
+ parser.add_argument(
+ '--pass_num',
+ type=int,
+ default=100,
+ help='Epoch number to train. (default: %(default)d)')
+ parser.add_argument(
+ '--device',
+ type=str,
+ default='CPU',
+ choices=['CPU', 'GPU'],
+ help='The device type.')
+ parser.add_argument(
+ '--crop_size',
+ type=int,
+ default=int(os.environ.get('CROP_SIZE', '1500')),
+        help='The max sentence length of input. Since this model uses a plain RNN,'
+        ' gradients could explode if the sentence is too long.')
+ args = parser.parse_args()
+ return args
+
+
+word_dict = imdb.word_dict()
+
+
+def crop_sentence(reader, crop_size):
+    unk_value = word_dict['<unk>']
+
+ def __impl__():
+ for item in reader():
+ if len([x for x in item[0] if x != unk_value]) < crop_size:
+ yield item
+
+ return __impl__
+
+
+def main():
+ args = parse_args()
+ lstm_size = args.hidden_dim
+
+ data = fluid.layers.data(
+ name="words", shape=[1], lod_level=1, dtype='int64')
+ sentence = fluid.layers.embedding(
+ input=data, size=[len(word_dict), args.emb_dim])
+
+ sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
+
+ rnn = fluid.layers.DynamicRNN()
+ with rnn.block():
+ word = rnn.step_input(sentence)
+ prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+ prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+ def gate_common(
+ ipt,
+ hidden,
+ size, ):
+ gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+ gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+ gate = fluid.layers.sums(input=[gate0, gate1])
+ return gate
+
+ forget_gate = fluid.layers.sigmoid(
+ x=gate_common(word, prev_hidden, lstm_size))
+ input_gate = fluid.layers.sigmoid(
+ x=gate_common(word, prev_hidden, lstm_size))
+ output_gate = fluid.layers.sigmoid(
+ x=gate_common(word, prev_hidden, lstm_size))
+ cell_gate = fluid.layers.tanh(
+ x=gate_common(word, prev_hidden, lstm_size))
+
+ cell = fluid.layers.sums(input=[
+ fluid.layers.elementwise_mul(
+ x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
+ x=input_gate, y=cell_gate)
+ ])
+
+ hidden = fluid.layers.elementwise_mul(
+ x=output_gate, y=fluid.layers.tanh(x=cell))
+
+ rnn.update_memory(prev_cell, cell)
+ rnn.update_memory(prev_hidden, hidden)
+ rnn.output(hidden)
+
+ last = fluid.layers.sequence_pool(rnn(), 'last')
+ logit = fluid.layers.fc(input=last, size=2, act='softmax')
+ loss = fluid.layers.cross_entropy(
+ input=logit,
+ label=fluid.layers.data(
+ name='label', shape=[1], dtype='int64'))
+ loss = fluid.layers.mean(x=loss)
+
+ # add acc
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
+ shape=[1], dtype='int64'), total=batch_size_tensor)
+
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ adam = fluid.optimizer.Adam()
+ adam.minimize(loss)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+
+ def train_loop(pass_num, crop_size):
+ with profiler.profiler(args.device, 'total') as prof:
+ for pass_id in range(pass_num):
+ train_reader = batch(
+ paddle.reader.shuffle(
+ crop_sentence(imdb.train(word_dict), crop_size),
+ buf_size=25000),
+ batch_size=args.batch_size)
+ word_nums = 0
+ pass_start_time = time.time()
+ for batch_id, data in enumerate(train_reader()):
+ tensor_words = to_lodtensor([x[0] for x in data], place)
+ for x in data:
+ word_nums += len(x[0])
+ label = numpy.array([x[1] for x in data]).astype("int64")
+ label = label.reshape((-1, 1))
+ loss_np, acc, weight = exe.run(
+ fluid.default_main_program(),
+ feed={"words": tensor_words,
+ "label": label},
+ fetch_list=[loss, batch_acc, batch_size_tensor])
+ print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
+ (pass_id, batch_id, loss_np, acc))
+
+ pass_end_time = time.time()
+ time_consumed = pass_end_time - pass_start_time
+ words_per_sec = word_nums / time_consumed
+ print("pass_id=%d, sec/pass: %f, words/s: %f" %
+ (pass_id, time_consumed, words_per_sec))
+
+ train_loop(args.pass_num, args.crop_size)
+
+
+def to_lodtensor(data, place):
+ seq_lens = [len(seq) for seq in data]
+ cur_len = 0
+ lod = [cur_len]
+ for l in seq_lens:
+ cur_len += l
+ lod.append(cur_len)
+ flattened_data = numpy.concatenate(data, axis=0).astype("int64")
+ flattened_data = flattened_data.reshape([len(flattened_data), 1])
+ res = fluid.LoDTensor()
+ res.set(flattened_data, place)
+ res.set_lod([lod])
+ return res
+
+
+if __name__ == '__main__':
+ main()
diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bf78e4cf08d43127a05c740fa30ca6d2bc416b0
--- /dev/null
+++ b/benchmark/fluid/vgg.py
@@ -0,0 +1,220 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import argparse
+import functools
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=5,
+    help='The number of initial minibatches to skip, for a more accurate performance test')
+parser.add_argument(
+ '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+ '--learning_rate',
+ type=float,
+ default=1e-3,
+ help="Learning rate for training.")
+parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help="The device type.")
+parser.add_argument(
+ '--data_format',
+ type=str,
+ default='NCHW',
+ choices=['NCHW', 'NHWC'],
+    help='The data order; only NCHW is currently supported.')
+parser.add_argument(
+ '--data_set',
+ type=str,
+ default='cifar10',
+ choices=['cifar10', 'flowers'],
+ help='Optional dataset for benchmark.')
+parser.add_argument(
+ '--with_test',
+ action='store_true',
+ help='If set, test the testset during training.')
+args = parser.parse_args()
+
+
+def vgg16_bn_drop(input):
+ def conv_block(input, num_filter, groups, dropouts):
+ return fluid.nets.img_conv_group(
+ input=input,
+ pool_size=2,
+ pool_stride=2,
+ conv_num_filter=[num_filter] * groups,
+ conv_filter_size=3,
+ conv_act='relu',
+ conv_with_batchnorm=True,
+ conv_batchnorm_drop_rate=dropouts,
+ pool_type='max')
+
+ conv1 = conv_block(input, 64, 2, [0.3, 0])
+ conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+ conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+ conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+ conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+ drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+ fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+ bn = fluid.layers.batch_norm(input=fc1, act='relu')
+ drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+ fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+ return fc2
+
+
+def main():
+ if args.data_set == "cifar10":
+ classdim = 10
+ if args.data_format == 'NCHW':
+ data_shape = [3, 32, 32]
+ else:
+ data_shape = [32, 32, 3]
+ else:
+ classdim = 102
+ if args.data_format == 'NCHW':
+ data_shape = [3, 224, 224]
+ else:
+ data_shape = [224, 224, 3]
+
+ # Input data
+ images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+ # Train program
+ net = vgg16_bn_drop(images)
+ predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ # Evaluator
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(
+ input=predict, label=label, total=batch_size_tensor)
+
+ # inference program
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ # Optimization
+ optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+ opts = optimizer.minimize(avg_cost)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ # Initialize executor
+ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ exe = fluid.Executor(place)
+
+ # Parameter initialization
+ exe.run(fluid.default_startup_program())
+
+ # data reader
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+
+ # test
+ def test(exe):
+ test_accuracy = fluid.average.WeightedAverage()
+ for batch_id, data in enumerate(test_reader()):
+ img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+ data)).astype("float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ acc, weight = exe.run(inference_program,
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[batch_acc, batch_size_tensor])
+ test_accuracy.add(value=acc, weight=weight)
+ return test_accuracy.eval()
+
+ iters, num_samples, start_time = 0, 0, time.time()
+ accuracy = fluid.average.WeightedAverage()
+ for pass_id in range(args.pass_num):
+ accuracy.reset()
+ train_accs = []
+ train_losses = []
+ for batch_id, data in enumerate(train_reader()):
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+ if iters == args.iterations:
+ break
+ img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+ data)).astype("float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ loss, acc, weight = exe.run(
+ fluid.default_main_program(),
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+ accuracy.add(value=acc, weight=weight)
+ iters += 1
+ num_samples += len(data)
+ print(
+ "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+ (pass_id, iters, loss, acc)
+            )  # The reported accuracy is accumulated over batches, not just the current batch.
+
+ pass_train_acc = accuracy.eval()
+ train_losses.append(loss)
+ train_accs.append(acc)
+ # evaluation
+ if args.with_test:
+ pass_test_acc = test(exe)
+ train_elapsed = time.time() - start_time
+    print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
+ (pass_id, np.mean(train_losses), np.mean(train_accs)))
+
+
+def print_arguments():
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == "__main__":
+ print_arguments()
+ main()
diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0159815fed81bdff6de3e561af569e9edc75f947
--- /dev/null
+++ b/cmake/external/threadpool.cmake
@@ -0,0 +1,30 @@
+INCLUDE(ExternalProject)
+
+SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
+SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
+INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})
+
+ExternalProject_Add(
+ extern_threadpool
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ GIT_REPOSITORY "https://github.com/progschj/ThreadPool.git"
+ GIT_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040
+ PREFIX ${THREADPOOL_SOURCE_DIR}
+ UPDATE_COMMAND ""
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ INSTALL_COMMAND ""
+ TEST_COMMAND ""
+)
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+ set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/threadpool_dummy.c)
+ file(WRITE ${dummyfile} "const char *dummy_threadpool = \"${dummyfile}\";")
+ add_library(simple_threadpool STATIC ${dummyfile})
+else()
+ add_library(simple_threadpool INTERFACE)
+endif()
+
+add_dependencies(simple_threadpool extern_threadpool)
+
+LIST(APPEND external_project_dependencies simple_threadpool)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c749c97f13649fe8432091414b56f7d0ea8ace8b..3fe750f47efc149bb1af6086841bffd5dd8e85fd 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -587,6 +587,9 @@ function(grpc_library TARGET_NAME)
get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+  # FIXME(putcn): the following line is supposed to generate *.pb.h and *.pb.cc,
+  # but somehow it didn't. Lines 602 to 604 patch this. Leaving this here
+  # for now to enable dist CI.
protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
@@ -597,6 +600,9 @@ function(grpc_library TARGET_NAME)
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
--plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+ COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+ ARGS --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+ "${ABS_PROTO}"
DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
# FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index a9b27933a5307aabeaf150aeb859e869197229f5..7066637a7cb27b83724cb4030c29a1019981f52b 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -1,2 +1,9 @@
+add_custom_target(paddle_apis ALL
+ DEPENDS paddle_v2_apis paddle_fluid_apis)
+
+add_custom_target(paddle_docs ALL
+ DEPENDS paddle_v2_docs paddle_v2_docs_cn
+ paddle_fluid_docs paddle_fluid_docs_cn)
+
add_subdirectory(v2)
add_subdirectory(fluid)
diff --git a/doc/design/images/parallel_executor_overview.dot b/doc/design/images/parallel_executor_overview.dot
new file mode 100644
index 0000000000000000000000000000000000000000..40753cb140540c08d9d4c449b8d377e315280436
--- /dev/null
+++ b/doc/design/images/parallel_executor_overview.dot
@@ -0,0 +1,83 @@
+digraph G {
+ subgraph cluster_init {
+ label="Initialization"
+ startup_program [label="startup", shape=box]
+ node_w_g0 [label="W\nGPU0"]
+ startup_program -> node_w_g0 [label="Initialize"]
+ node_w_g1 [label="W\nGPU1"]
+ node_w_g0 -> node_w_g1 [label="broadcast"]
+ }
+
+ subgraph cluster_train {
+ label="forward_backward"
+
+ subgraph cluster_gpu0 {
+ label="GPU0"
+ fc_0 [label="fc\nGPU0", shape=box]
+ hidden_0 [label="hidden\nGPU0"]
+ node_w_g0 -> fc_0
+ fc_0 -> hidden_0
+ loss0 [label="loss\nGPU0"]
+ hidden_0 -> loss0 [label="many ops omitted"]
+ scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
+ loss_g0 [label="loss_grad\nGPU0"]
+ scale_loss_0->loss_g0
+
+ fc_g_0 [label="w_grad\nGPU0", shape=box]
+ loss0 -> fc_g_0
+ loss_g0 -> fc_g_0
+ hidden_0 -> fc_g_0
+ }
+
+ subgraph cluster_gpu1 {
+ label="GPU1"
+ fc_1 [label="fc\nGPU1", shape=box]
+ hidden_1 [label="hidden\nGPU1"]
+ node_w_g1 -> fc_1
+ fc_1 -> hidden_1
+ loss1 [label="loss\nGPU1"]
+ hidden_1 -> loss1 [label="many ops omitted"]
+ scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
+ loss_g1 [label="loss_grad\nGPU1"]
+ scale_loss_1->loss_g1
+
+ fc_g_1 [label="w_grad\nGPU1", shape=box]
+ loss1 -> fc_g_1
+ loss_g1 -> fc_g_1
+ hidden_1 -> fc_g_1
+ }
+ }
+
+ all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
+ fc_g_0 -> all_reduce_w
+ fc_g_1 -> all_reduce_w
+
+ fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
+ fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
+ all_reduce_w -> fc_g_0_merged
+ all_reduce_w -> fc_g_1_merged
+
+ subgraph cluster_optimization {
+ label="Optimization"
+ subgraph cluster_opt_gpu0 {
+ label="GPU0"
+ sgd_0 [label="SGD Op\nGPU0", shape=box]
+
+ fc_g_0_merged -> sgd_0
+ node_w_g0 -> sgd_0
+ optimized_w_0 [label="Optimized W\nGPU0"]
+ sgd_0 -> optimized_w_0
+ }
+ subgraph cluster_opt_gpu1 {
+ label="GPU1"
+ sgd_1 [label="SGD Op\nGPU1", shape=box]
+
+ fc_g_1_merged -> sgd_1
+ node_w_g1 -> sgd_1
+      optimized_w_1 [label="Optimized W\nGPU1"]
+ sgd_1 -> optimized_w_1
+ }
+ }
+
+
+}
diff --git a/doc/design/images/parallel_executor_overview.png b/doc/design/images/parallel_executor_overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..d890c0ffee3b38dc7cb74a2b56c2ab4831532211
Binary files /dev/null and b/doc/design/images/parallel_executor_overview.png differ
diff --git a/doc/design/parallel_executor.md b/doc/design/parallel_executor.md
new file mode 100644
index 0000000000000000000000000000000000000000..9aed3b059a1595ba3971d7d5acfc0d16a731584b
--- /dev/null
+++ b/doc/design/parallel_executor.md
@@ -0,0 +1,104 @@
+# ParallelExecutor
+
+## Background
+
+Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter (i.e. the `executor` concept in Fluid). The instructions or operators in a `Program` are executed, and the results can be fetched on the Python side.
+
+The executor is a very naive interpreter: it runs operators one by one. We can use `Parallel.Do` to support data parallelism; however, since `ProgramDesc` lacks device information, it is not possible to optimize the performance of `Parallel.Do`.
+
+We want a `ProgramDesc` can be run on different nodes. It is better not to contain device information in `ProgramDesc`. However, we can write a high-performance interpreter, which can hold an alternative intermediate representation of `ProgramDesc`, to take full usage of Multi-GPUs.
+
+ParallelExecutor is an interpreter of `ProgramDesc` which will [out-of-order execute](https://en.wikipedia.org/wiki/Out-of-order_execution) `Program` in data parallelism mode and maximise the utility of Multi-GPUs.
+
+
+## Overview of Multi-GPU Logic
+
+The ParallelExecutor takes the startup program and the main program as inputs. The parameters are initialized on `GPU0` by the startup program and then broadcast to the other GPUs. The main program is duplicated onto every GPU. The gradients are merged during each iteration, and each device optimizes its parameters independently. Since the gradients on each device are merged before parameter optimization, the parameters stay identical on every device, and there is no need to broadcast them again.
+
+![alt](images/parallel_executor_overview.png)
+
+There are several optimizations for this logic.
+
+1. We use an alternative representation in ParallelExecutor because device information is critical for performance optimization.
+2. The execution is out-of-order, i.e., an operator is executed whenever the inputs of the operator are ready.
+    * A GPU is a high-performance device, but a single CPU thread cannot keep one GPU fully occupied, so a thread pool is used to execute operators.
+    * Out-of-order execution also helps transpilers to generate `ProgramDesc`. A transpiler does not need to worry about the best execution order for performance.
+3. The streams for computation, gradient merging, and data fetching are different.
+
+The performance of `ResNeXt152` on a `TitanX`, with `batch_size=12`, is shown below.
+
+| Number of GPUs | 1 | 2 | 3 | 4|
+| --- | --- | --- | --- | --- |
+| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 |
+| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 |
+
+
+## Static Single Assignment Graph
+
+[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form) (`SSA` for short) is a common form for compiler optimization. To support concurrent execution, we use an `SSA` graph as an intermediate representation of `ProgramDesc`.
+
+The `Program` is a directed acyclic graph in which a variable can be assigned multiple times. We enforce that each variable is assigned only once by adding version numbers to variables, and we parse the `Program` into an `SSA` graph. Also, ParallelExecutor duplicates the `Program` onto multiple devices, so we add a device number to variables as well and insert `NCCLAllReduce` operators into the graph.
+
+The data structure of `SSA` graph is:
+
+```c++
+struct VarHandleBase {
+ OpHandleBase* generated_op_;
+  vector<OpHandleBase*> pending_ops_;
+
+ string name;
+ Place place;
+ size_t version;
+};
+
+struct OpHandleBase {
+  vector<VarHandleBase*> inputs_;
+  vector<VarHandleBase*> outputs_;
+};
+
+struct SSAGraph {
+  // Vars on each device:
+  // * each map in the vector holds the vars of one device;
+  // * each map maps a variable name to the variable handles
+  //   with different versions.
+  vector<unordered_map<string, vector<VarHandle>>> vars_;
+
+ // All ops
+  vector<OpHandleBase*> ops_;
+};
+```
+The variable handles are wrappers of `Variable`s, and the operator handles are wrappers of `OperatorBase`. Some `OpHandle`s are not wrappers of an `OperatorBase`, e.g., `NCCLAllReduceOpHandle`, because `NCCLAllReduceOpHandle` uses new device contexts.
+
+When the `ProgramDesc` is converted into an `SSA` graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem also needs to be taken care of. Dummy variables, which represent dependencies between operators, are manually inserted into the SSA graph to resolve data hazards.
+
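+For example, a write-after-write hazard between two operators that both write `W` can be resolved by a dummy variable. Below is a sketch building on the handle structs above; the helper name is illustrative, not part of the actual code.
+
+```c++
+// The dummy variable carries no data; it only forces second_writer to
+// run after first_writer.
+void AddWriteAfterWriteDep(OpHandleBase* first_writer,
+                           OpHandleBase* second_writer) {
+  auto* dummy = new VarHandleBase();
+  dummy->generated_op_ = first_writer;
+  first_writer->outputs_.push_back(dummy);
+  dummy->pending_ops_.push_back(second_writer);
+  second_writer->inputs_.push_back(dummy);
+}
+```
+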
+## Execute SSA Graph
+
+The SSA graph can be executed out of order by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is:
+
+1. Maintain a map from each operator to the number of inputs it still needs.
+2. If a variable is not generated by any operator, i.e., `var.generated_op == nullptr`, decrease the needed input count of each of its pending operators.
+3. If an operator's needed input count drops to zero, run this operator.
+4. After running the operator, mark its output variables as generated and repeat step 2 until all variables are generated.
+
+Running an operator can be asynchronous. There is a thread pool to execute the `SSA` graph.
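+
+Below is a minimal sketch of this scheduling loop. The types are simplified stand-ins (`Op`, `Var`), not the actual Fluid handle classes, and the real executor dispatches `Run()` to the thread pool instead of calling it inline.
+
+```c++
+#include <queue>
+#include <unordered_map>
+#include <vector>
+
+struct Op;
+struct Var {
+  Op* generated_op = nullptr;    // nullptr if the variable is a graph input
+  std::vector<Op*> pending_ops;  // ops that consume this variable
+};
+struct Op {
+  std::vector<Var*> inputs;
+  std::vector<Var*> outputs;
+  void Run() { /* execute the wrapped operator */ }
+};
+
+// Approximate topological-sort execution: an op runs as soon as all of
+// its inputs have been generated.
+void Execute(const std::vector<Op*>& ops, const std::vector<Var*>& vars) {
+  std::unordered_map<Op*, size_t> pending_inputs;
+  std::queue<Op*> ready;
+  for (Op* op : ops) {
+    pending_inputs[op] = op->inputs.size();
+    if (op->inputs.empty()) ready.push(op);
+  }
+  // Variables without a generating op are ready from the start.
+  for (Var* var : vars) {
+    if (var->generated_op != nullptr) continue;
+    for (Op* op : var->pending_ops) {
+      if (--pending_inputs[op] == 0) ready.push(op);
+    }
+  }
+  while (!ready.empty()) {
+    Op* op = ready.front();
+    ready.pop();
+    op->Run();  // the real executor dispatches this to a thread pool
+    for (Var* out : op->outputs) {
+      for (Op* pending : out->pending_ops) {
+        if (--pending_inputs[pending] == 0) ready.push(pending);
+      }
+    }
+  }
+}
+```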
+
+## Synchronize GPU Kernels
+
+The GPU is a non-blocking device. Different streams need to be synchronized when switching streams. In the current implementation, the synchronization is based on the following algorithm:
+
+1. Each `OpHandle` records the `DeviceContext` that it uses.
+2. In `OpHandle::Run`, if the `DeviceContext` of the current operator is different from the `DeviceContext` of any input variable, wait for the operator that generated this input variable.
+
+The `wait` is implemented by one of two strategies:
+
+1. Invoke `DeviceContext->Wait()`, which waits for all operators on this device context to complete.
+2. Use `cudaStreamWaitEvent` to send an event to the stream. It is a non-blocking call; the wait operation is executed on the GPU.
+
+Generally, `cudaStreamWaitEvent` has better performance. However, the `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed at runtime.
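+
+For illustration, the two strategies look roughly as follows in raw CUDA, outside Fluid's `DeviceContext` abstraction (the wrapper function names here are hypothetical):
+
+```c++
+#include <cuda_runtime.h>
+
+// Strategy 1: host-side blocking wait. Easy to debug, because all work
+// previously submitted to the producer stream is finished afterwards.
+void WaitBlocking(cudaStream_t producer_stream) {
+  cudaStreamSynchronize(producer_stream);
+}
+
+// Strategy 2: record an event at the current end of the producer stream
+// and make the consumer stream wait for it. The host never blocks; the
+// wait is executed on the GPU.
+void WaitWithEvent(cudaStream_t producer_stream, cudaStream_t consumer_stream) {
+  cudaEvent_t event;
+  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
+  cudaEventRecord(event, producer_stream);
+  cudaStreamWaitEvent(consumer_stream, event, 0);
+  cudaEventDestroy(event);  // resources are released once the wait completes
+}
+```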
+
+## What's next?
+
+* Merging gradients of dense parameters is done. However, merging gradients of sparse parameters is not done yet.
+* The CPU version of ParallelExecutor is not implemented yet. The out-of-order logic will make CPU computation faster, too.
+* A better strategy to merge gradients can be introduced, e.g., shrinking the gradients from `float32` to `int8` or `int4` while merging. It would significantly speed up multi-GPU training without much loss of precision.
+* Combine with the multi-node implementation. Thanks to out-of-order execution, send and recv operators can be blocking operators, and the transpiler does not need to worry about the best position of an operator.
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
index cc999f5a8d70a2239ea3b130e9da172d5f681c65..9fe79323ef9377a459d8405cfa74c88c52ce9346 100644
--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
@@ -27,6 +27,8 @@ sphinx_add_target(paddle_fluid_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
+add_dependencies(paddle_fluid_docs gen_proto_py)
+
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -47,3 +49,7 @@ sphinx_add_target(paddle_fluid_docs_cn
${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
+
+add_dependencies(paddle_fluid_docs_cn gen_proto_py)
+
+add_subdirectory(api)
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ca40dfb9644cea69329be0ec231378506c138bc0
--- /dev/null
+++ b/doc/fluid/api/CMakeLists.txt
@@ -0,0 +1,22 @@
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+ "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
+ "${BINARY_BUILD_DIR_EN}/conf.py"
+ @ONLY)
+
+sphinx_add_target(paddle_fluid_apis
+ html
+ ${BINARY_BUILD_DIR_EN}
+ ${SPHINX_CACHE_DIR_EN}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind)
diff --git a/doc/v2/api/fluid/data_feeder.rst b/doc/fluid/api/data_feeder.rst
similarity index 100%
rename from doc/v2/api/fluid/data_feeder.rst
rename to doc/fluid/api/data_feeder.rst
diff --git a/doc/v2/api/fluid/evaluator.rst b/doc/fluid/api/evaluator.rst
similarity index 100%
rename from doc/v2/api/fluid/evaluator.rst
rename to doc/fluid/api/evaluator.rst
diff --git a/doc/v2/api/fluid/executor.rst b/doc/fluid/api/executor.rst
similarity index 100%
rename from doc/v2/api/fluid/executor.rst
rename to doc/fluid/api/executor.rst
diff --git a/doc/v2/api/fluid/gen_doc.py b/doc/fluid/api/gen_doc.py
similarity index 100%
rename from doc/v2/api/fluid/gen_doc.py
rename to doc/fluid/api/gen_doc.py
diff --git a/doc/v2/api/fluid/gen_doc.sh b/doc/fluid/api/gen_doc.sh
similarity index 100%
rename from doc/v2/api/fluid/gen_doc.sh
rename to doc/fluid/api/gen_doc.sh
diff --git a/doc/v2/api/fluid/index.rst b/doc/fluid/api/index_en.rst
similarity index 100%
rename from doc/v2/api/fluid/index.rst
rename to doc/fluid/api/index_en.rst
diff --git a/doc/v2/api/fluid/initializer.rst b/doc/fluid/api/initializer.rst
similarity index 100%
rename from doc/v2/api/fluid/initializer.rst
rename to doc/fluid/api/initializer.rst
diff --git a/doc/v2/api/fluid/io.rst b/doc/fluid/api/io.rst
similarity index 100%
rename from doc/v2/api/fluid/io.rst
rename to doc/fluid/api/io.rst
diff --git a/doc/v2/api/fluid/layers.rst b/doc/fluid/api/layers.rst
similarity index 99%
rename from doc/v2/api/fluid/layers.rst
rename to doc/fluid/api/layers.rst
index ae35d8c53476b34cb18331364267dd7c8b94dd64..22e6fb13d7320986a60bc1ef5530187e0970c767 100644
--- a/doc/v2/api/fluid/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -494,6 +494,12 @@ reshape
.. autofunction:: paddle.fluid.layers.reshape
:noindex:
+pad
+---
+
+.. autofunction:: paddle.fluid.layers.pad
+ :noindex:
+
scale
-----
diff --git a/doc/v2/api/fluid/nets.rst b/doc/fluid/api/nets.rst
similarity index 100%
rename from doc/v2/api/fluid/nets.rst
rename to doc/fluid/api/nets.rst
diff --git a/doc/v2/api/fluid/optimizer.rst b/doc/fluid/api/optimizer.rst
similarity index 100%
rename from doc/v2/api/fluid/optimizer.rst
rename to doc/fluid/api/optimizer.rst
diff --git a/doc/v2/api/fluid/param_attr.rst b/doc/fluid/api/param_attr.rst
similarity index 100%
rename from doc/v2/api/fluid/param_attr.rst
rename to doc/fluid/api/param_attr.rst
diff --git a/doc/v2/api/fluid/profiler.rst b/doc/fluid/api/profiler.rst
similarity index 100%
rename from doc/v2/api/fluid/profiler.rst
rename to doc/fluid/api/profiler.rst
diff --git a/doc/v2/api/fluid/regularizer.rst b/doc/fluid/api/regularizer.rst
similarity index 100%
rename from doc/v2/api/fluid/regularizer.rst
rename to doc/fluid/api/regularizer.rst
diff --git a/doc/fluid/build_and_install/build_from_source_cn.rst b/doc/fluid/build_and_install/build_from_source_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..ae4e8c7c48e584ec16a7be5466f83dd154ffb5fb
--- /dev/null
+++ b/doc/fluid/build_and_install/build_from_source_cn.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/build_from_source_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/build_from_source_en.rst b/doc/fluid/build_and_install/build_from_source_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..1ac828c973826bb8374c4aa8e17fda3ea1bb939f
--- /dev/null
+++ b/doc/fluid/build_and_install/build_from_source_en.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/build_from_source_en.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/docker_install_cn.rst b/doc/fluid/build_and_install/docker_install_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..965b2e20559291989422938c418fadbac16941b9
--- /dev/null
+++ b/doc/fluid/build_and_install/docker_install_cn.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/docker_install_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/docker_install_en.rst b/doc/fluid/build_and_install/docker_install_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..79d7341a7bbb9e477c773134f24983fd7607769a
--- /dev/null
+++ b/doc/fluid/build_and_install/docker_install_en.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/docker_install_en.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst
deleted file mode 100644
index 9276236f9fd511bde3570a8c88b437119911d60a..0000000000000000000000000000000000000000
--- a/doc/fluid/build_and_install/index_cn.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-安装与使用
-------------
diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..f697fcd8fac9131862ae7f8f51c5ebe93737ad2d
--- /dev/null
+++ b/doc/fluid/build_and_install/index_cn.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/index_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst
deleted file mode 100644
index cc1e61a58a026a0f5c3b106875a8a86dc9cba613..0000000000000000000000000000000000000000
--- a/doc/fluid/build_and_install/index_en.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Build and Install
-------------
diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..502f66a41319d4f41ae1774628ca36da9dca76ce
--- /dev/null
+++ b/doc/fluid/build_and_install/index_en.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/index_en.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/pip_install_cn.rst b/doc/fluid/build_and_install/pip_install_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..07deca84b82ff553e0c19324695089dcfb6be90e
--- /dev/null
+++ b/doc/fluid/build_and_install/pip_install_cn.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/pip_install_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/pip_install_en.rst b/doc/fluid/build_and_install/pip_install_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..7f39c998195b719b05443e96f1c4a6a8d44b98c9
--- /dev/null
+++ b/doc/fluid/build_and_install/pip_install_en.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/pip_install_en.rst
\ No newline at end of file
diff --git a/doc/fluid/design/algorithm/index_cn.rst b/doc/fluid/design/algorithm/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0883a9dc9c457f393ac1bdc930cb47ebcb0a25d9
--- /dev/null
+++ b/doc/fluid/design/algorithm/index_cn.rst
@@ -0,0 +1,7 @@
+梯度更新算法
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ parameter_average.md
diff --git a/doc/fluid/design/algorithm/index_en.rst b/doc/fluid/design/algorithm/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..59fe68dcf79ce2ef90b9adc829a0db45a4f0b3dc
--- /dev/null
+++ b/doc/fluid/design/algorithm/index_en.rst
@@ -0,0 +1,7 @@
+Gradient Update Algorithm
+--------------------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ parameter_average.md
diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
index 2c4edee9fe31d502ea62b9fe5c8757c0a4c5e79f..53d601d3a9a37e8adad519833bb6fa2dc48023a0 100644
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -7,7 +7,7 @@ Polyak and Juditsky (1992) showed that the test performance of simple average of
Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for ![](./images/theta_star.gif)
. The averaging is done as follows:
-![](./images/asgd.gif)
+![](./images/asgd.gif)
We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md
index bf0e4dddc1b640ecbce489f65820aaf8a4b3b1e7..8ded0ad22f4013a521bf3bee260565dc5cf855ae 100644
--- a/doc/fluid/design/concepts/README.md
+++ b/doc/fluid/design/concepts/README.md
@@ -2,15 +2,37 @@ A few months ago when we were trying to replace CMake with Bazel, @emailweixu su
Here are some initial thoughts. Your comments are welcome!
-### Required CMake Function
+# Required CMake Function
I think we need only the following few CMake functions to make a project description mean and clean:
-| C++ | CUDA C++ | Go |
-|---|---|---|
-| cc_library | nv_library | go_library |
-| cc_binary | nv_binary | go_binary |
-| cc_test | nv_test | go_test |
+
+<table>
+<thead>
+<tr>
+<th>C++</th>
+<th>CUDA C++</th>
+<th>Go</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>cc_library</td>
+<td>nv_library</td>
+<td>go_library</td>
+</tr>
+<tr>
+<td>cc_binary</td>
+<td>nv_binary</td>
+<td>go_binary</td>
+</tr>
+<tr>
+<td>cc_test</td>
+<td>nv_test</td>
+<td>go_test</td>
+</tr>
+</tbody>
+</table>
+
- The `_library` functions generate .a files from source code.
- The `_binary` functions generate executable binary files.
@@ -25,7 +47,7 @@ Also,
- to describe external dependencies, we need `external_library`.
- to build shared libraries, we need `shared_library`.
-### An Example Project
+## An Example Project
Suppose that we have aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files:
@@ -102,11 +124,11 @@ shared_library(api
```
-### Implementation
+## Implementation
As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
-### Using Package Manager For Go
+## Using Package Manager For Go
Building Go binaries and libraries need to satisfy their dependencies, generally
we can do `go get ./...` to download and compile all external dependencies. The
@@ -122,7 +144,7 @@ problems are:
at many cloud file hosting sites, so users who want to compile paddle by themselves can
download this "vendor" package from a mirror site.
-#### Choose A Suitable Tool
+### Choose A Suitable Tool
As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools)
list dozens of Go package managers. We choose the tool using following principles:
@@ -140,7 +162,7 @@ management tool has been started at: https://github.com/golang/dep to resolve
such problems, but it's currently at Alpha stage. So the best choice now is
glide obviously.
-#### Manage Go Packages
+### Manage Go Packages
- Dependencies: `go/glide.yaml` will store the dependencies and their versions which
is directly imported by paddle. `go/glide.lock` will store all dependencies recursively
diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md
index 907a2def557fd472ac4d679c73447bd9107d1190..3b626bd89cd83a9428997abccfeeebbbbdbb3d38 100644
--- a/doc/fluid/design/concepts/block.md
+++ b/doc/fluid/design/concepts/block.md
@@ -14,11 +14,29 @@ In programming languages, a block is a pair of curly braces that includes local
Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
-| programming languages | PaddlePaddle |
-|-----------------------|-----------------------|
-| for, while loop | RNN, WhileOp |
-| if, if-else, switch | IfElseOp, SwitchOp |
-| sequential execution | a sequence of layers |
+
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>for, while loop</td>
+<td>RNN, WhileOp</td>
+</tr>
+<tr>
+<td>if, if-else, switch</td>
+<td>IfElseOp, SwitchOp</td>
+</tr>
+<tr>
+<td>sequential execution</td>
+<td>a sequence of layers</td>
+</tr>
+</tbody>
+</table>
+
A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
@@ -26,12 +44,33 @@ A key difference is that a C++ program describes a one pass computation, whereas
The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
-| programming languages | PaddlePaddle |
-|-----------------------|---------------------------------|
-| stack | scope hierarchy |
-| stack frame | scope |
-| push at entering block| push at entering block |
-| pop at leaving block | destroy when minibatch completes|
+
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>stack</td>
+<td>scope hierarchy</td>
+</tr>
+<tr>
+<td>stack frame</td>
+<td>scope</td>
+</tr>
+<tr>
+<td>push at entering block</td>
+<td>push at entering block</td>
+</tr>
+<tr>
+<td>pop at leaving block</td>
+<td>destroy when minibatch completes</td>
+</tr>
+</tbody>
+</table>
+
1. In traditional programs:
diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md
index 8607b40ccbbe01db77afed72c1efa780b520744c..aabc1ba75a67c5767d409bd6e7e6240dec86b16c 100644
--- a/doc/fluid/design/concepts/cpp_data_feeding.md
+++ b/doc/fluid/design/concepts/cpp_data_feeding.md
@@ -113,7 +113,7 @@ To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an e
To create and invoke readers, some new ops are introduced:
-### CreateReaderOp
+### Operators That Create Readers
Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
@@ -153,19 +153,52 @@ double_buffer_reader = create_double_buffer_op(batch_reader)
The forwarding ops of the corresponding `main_program` would be like this:
```
-while_op {
+not_completed = true
+pass_count = 0
+while_op(not_completed) {
has_next = has_next_op(double_buffer_reader)
if_else_op(has_next) {
batch_data = read_op(double_buffer_reader)
... (subsequent training ops)
} else {
reset_op(double_buffer_reader)
+ increase_op(pass_count)
+        not_completed = less_than_op(pass_count, required_pass_num)
}
}
```
-Two important considerations for these programs are as follows:
+A few important considerations for these programs are as follows:
-1. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
+1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables.
-2. All readers exist in both `startup_program` and `main_program`. And they are persistable.
+2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
+
+3. All readers exist in both `startup_program` and `main_program`. And they are persistable.
+
+### Simplify Configuration by MultiPassReader
+
+The Program configuration mentioned above is complicated. Users need to be very familiar with the concepts of Program and Block to avoid making mistakes in their code. To make the usage of C++ readers friendlier to new users, we introduce `MultiPassReader`.
+
+`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes (`pass_num`) and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches EOF, the multi-pass reader checks whether it has completed the given number of passes. If not, the underlying reader is re-initialized and a new pass starts automatically. Before the whole training completes, the return value of MultiPassReader's `HasNext()` is always `true`.
+
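+A minimal C++ sketch of this pass-counting logic, under an assumed reader interface (the actual Fluid classes differ):
+
+```c++
+struct Reader {
+  virtual bool HasNext() = 0;
+  virtual void ReadNext() = 0;  // fills the next mini-batch
+  virtual void ReInit() = 0;    // restarts the underlying data source
+  virtual ~Reader() = default;
+};
+
+class MultiPassReader : public Reader {
+ public:
+  MultiPassReader(Reader* underlying, int pass_num)
+      : underlying_(underlying), pass_num_(pass_num), pass_count_(0) {}
+
+  bool HasNext() override {
+    if (underlying_->HasNext()) return true;
+    ++pass_count_;                               // one pass just finished
+    if (pass_count_ >= pass_num_) return false;  // whole training is done
+    underlying_->ReInit();                       // start the next pass
+    return underlying_->HasNext();
+  }
+
+  void ReadNext() override { underlying_->ReadNext(); }
+  void ReInit() override {
+    underlying_->ReInit();
+    pass_count_ = 0;
+  }
+
+ private:
+  Reader* underlying_;
+  int pass_num_;
+  int pass_count_;
+};
+```
+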
+With `MultiPassReader`, the startup program would be like this:
+
+```
+multiple_reader = open_files_op(...)
+batch_reader = create_batch_reader_op(multiple_reader)
+multi_pass_reader = create_multi_pass_reader_op(batch_reader)
+double_buffer_reader = create_double_buffer_op(multi_pass_reader)
+... (other initializers)
+```
+
+The forwarding part of the corresponding `main_program` would be like this:
+
+```
+not_completed = true
+while_op(not_completed) {
+ batch_data = read_op(double_buffer_reader)
+ ... (subsequent training ops)
+ not_completed = has_next_op(double_buffer_reader)
+}
+```
diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
index 984b59f4c6971dfb6f46dfe342f2751f392c0e88..30bc488a18a28d349645d9d2502aae6691a69931 100644
--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -86,12 +86,40 @@ def layer.fc(X):
We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in above illustrative example:
-
-| C++ functions/functors | mul | add | | |
-|------------------------|--------------|--------------|-------------|----------|
-| C++ operator class | mulOp | addOp | FCOp | |
-| Python binding | operator.mul | operator.add | operator.fc | |
-| Python function | | | | layer.fc |
+
+<table>
+<thead>
+<tr>
+<th>C++ functions/functors</th>
+<th>mul</th>
+<th>add</th>
+<th></th>
+<th></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>C++ operator class</td>
+<td>mulOp</td>
+<td>addOp</td>
+<td>FCOp</td>
+<td></td>
+</tr>
+<tr>
+<td>Python binding</td>
+<td>operator.mul</td>
+<td>operator.add</td>
+<td>operator.fc</td>
+<td></td>
+</tr>
+<tr>
+<td>Python function</td>
+<td></td>
+<td></td>
+<td></td>
+<td>layer.fc</td>
+</tr>
+</tbody>
+</table>
+
This is how we differentiate layer and operators in PaddlePaddle:
diff --git a/doc/fluid/design/concepts/index_cn.rst b/doc/fluid/design/concepts/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..eec8a2f14ca9e8b3bf0d0acbbb6004972790d795
--- /dev/null
+++ b/doc/fluid/design/concepts/index_cn.rst
@@ -0,0 +1,18 @@
+核心概念
+-------------
+
+.. toctree::
+ :maxdepth: 1
+
+ README.md
+ cpp_data_feeding.md
+ functions_operators_layers.md
+ program.md
+ variable.md
+ var_desc.md
+ tensor.md
+ tensor_array.md
+ lod_tensor.md
+ block.md
+ scope.md
+ executor.md
diff --git a/doc/fluid/design/concepts/index_en.rst b/doc/fluid/design/concepts/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..036e1da2550cf520f5c40ecd9657f71603755adc
--- /dev/null
+++ b/doc/fluid/design/concepts/index_en.rst
@@ -0,0 +1,18 @@
+Core Concepts
+--------------------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ README.md
+ cpp_data_feeding.md
+ functions_operators_layers.md
+ program.md
+ variable.md
+ var_desc.md
+ tensor.md
+ tensor_array.md
+ lod_tensor.md
+ block.md
+ scope.md
+ executor.md
diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
index 10a8a7867fbf072f585fe3bfb1243e4e6bef4ec8..a88292e7888d0ebc64ee89ca315dfea38a12c71d 100644
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ b/doc/fluid/design/concepts/lod_tensor.md
@@ -2,12 +2,38 @@
Like other deep learning systems, PaddlePaddle supports training models from sequence data. Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor. What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus no need for padding zeros.
-| | TensorFlow | PaddlePaddle |
-|-----------------------|------------|--------------|
-| RNN | Support | Support |
-| recursive RNN | Support | Support |
-| padding zeros | Must | No need |
-| blob data type | Tensor | LoDTensor |
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>TensorFlow</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>RNN</td>
+<td>Support</td>
+<td>Support</td>
+</tr>
+<tr>
+<td>recursive RNN</td>
+<td>Support</td>
+<td>Support</td>
+</tr>
+<tr>
+<td>padding zeros</td>
+<td>Must</td>
+<td>No need</td>
+</tr>
+<tr>
+<td>blob data type</td>
+<td>Tensor</td>
+<td>LoDTensor</td>
+</tr>
+</tbody>
+</table>
+
PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators. The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences. This document presents the design of LoD and LoDTensor.
diff --git a/doc/fluid/design/concepts/scope.md b/doc/fluid/design/concepts/scope.md
index 4da76eebb74abcd26ec2b8671399e6bc4fb58574..dcf76649357aaef80d6bc1a933ece8c4c1063547 100644
--- a/doc/fluid/design/concepts/scope.md
+++ b/doc/fluid/design/concepts/scope.md
@@ -30,7 +30,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
Variable can not belong to many scopes. If you want to use variables from parent scope, you can use `parent scope`.
-1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else.
+1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else.
Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be an invalid pointer when associated `Scope` is destroyed.
@@ -78,7 +78,7 @@ In `Scope` class, there is a private data member called `parent_`. `parent_` is
A local scope is very useful when we implement Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily.
-# Interface Design
+## Interface Design
```cpp
class Variable {
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
index 6a45af1995463402ba9c65ddb51c6c8bb107f99e..6750323c0167bf1efbde6ef4fd670e88a5aa502a 100644
--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -1,3 +1,5 @@
+# Design Doc: Var_desc
+
## Background
PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
@@ -8,10 +10,27 @@ PaddlePaddle uses proto message to describe compile time program because :
The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`) and `Operations`. The concept to represent them is in the table below.
-| |compile time|runtime|
-|---|---|---|
-|Data|VarDesc(proto)|Variable(cpp)|
-|Operation|OpDesc(proto)|Operator(cpp)|
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>compile time</th>
+<th>runtime</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td>VarDesc(proto)</td>
+<td>Variable(cpp)</td>
+</tr>
+<tr>
+<td>Operation</td>
+<td>OpDesc(proto)</td>
+<td>Operator(cpp)</td>
+</tr>
+</tbody>
+</table>
+
## Definition of VarType
diff --git a/doc/fluid/design/concurrent/channel.md b/doc/fluid/design/concurrent/channel.md
new file mode 100644
index 0000000000000000000000000000000000000000..a00a3325e7b49381f0f82ebbf32b74683f02de5f
--- /dev/null
+++ b/doc/fluid/design/concurrent/channel.md
@@ -0,0 +1,139 @@
+# Channel Design
+
+## Introduction
+
+A Channel is a data structure that allows for synchronous interprocess
+communication via message passing. It is a fundamental component of CSP
+(communicating sequential processes), and allows users to pass data
+between threads without having to worry about synchronization.
+
+## How to use it
+
+Paddle offers Python APIs to open and close channels, along with sending
+data to and receiving data from a channel.
+
+### Create a channel
+
+Creates a new channel that takes in variables of a specific dtype.
+
+- **fluid.make_channel(dtype, capacity=0)**
+ - **dtype**: The data type of variables being sent/received through channel
+ - **capacity**: The capacity of the channel. A capacity of 0 represents
+ an unbuffered channel. Capacity > 0 represents a buffered channel
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=10)
+```
+
+### Close a channel
+
+Closes a channel. Any pending senders and receivers will be woken up during
+this time. Receivers can still receive from a closed channel, but senders
+are not allowed to send any additional data to the channel (Paddle will
+raise an exception if users try to send to a closed channel).
+
+- **fluid.channel_close(channel)**
+
+```
+fluid.channel_close(ch)
+```
+
+### Send data to a channel
+
+Sends a variable to a channel. Currently, variables of dtype `LoDTensor`,
+`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
+`ChannelHolder` are supported.
+
+By default, the data of the Variable is moved from the sender to the receiver;
+however, the user can optionally copy the data before performing the send.
+
+- **channel_send(channel, variable, is_copy=False)**
+ - **channel**: The channel to send the variable to
+ - **variable**: The variable to send to the channel
+ - **is_copy**: If set to True, channel_send will perform a variable assign
+ to copy the source variable to a new variable to be sent.
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+var = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=100)
+fluid.channel_send(ch, var, is_copy=True)
+```
+
+### Receive data from a channel
+
+Receives a variable from a channel. The data of the variable is moved to the
+receiving variable.
+
+- **channel_recv(channel, return_variable)**
+ - **channel**: The channel to receive the variable from
+ - **return_variable**: The destination variable used to store the data of the
+ variable received from the channel
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+var = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
+fluid.channel_recv(ch, var)
+```
+
+## How it Works
+
+Channels provide a simple interface for different threads to share data.
+To support the synchronization requirements, channels utilize a series of
+internal queues, locks, and condition variables.
+
+### QueueMessage
+
+QueueMessage encapsulates the state of a channel send/receive operation to be
+put in the **sendq/recvq**. It contains a condition variable used to block the
+thread (when there are no available sends/receives). In addition, it contains
+a callback function to notify a thread when the QueueMessage is being
+processed by the channel.
+
+### Queues
+
+- **buff_**: This queue holds the data buffer in a buffered channel. The
+capacity is set to the capacity of the channel. This data buffer is not
+used in an unbuffered channel.
+
+- **sendq**: This queue holds the QueueMessage of any pending senders of a
+channel. When a thread performs a channel_send operation on the channel, the
+channel_send operation will put a new QueueMessage on the sendq and block the
+current thread under two conditions:
+ 1. The channel is buffered and is full
+ 2. The channel is unbuffered and does not have a receiver
+
+- **recvq**: This queue holds the QueueMessage of any pending receivers of a
+channel. When a thread performs a channel_recv operation on the channel, the
+channel_recv operation will put a new QueueMessage on the recvq and block the
+current thread under two conditions:
+  1. The channel is buffered and there is no data in the buff_
+ 2. The channel is unbuffered and does not have a sender
+
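+To make the interplay of these pieces concrete, here is a minimal buffered channel built from a queue, a lock, and two condition variables. This is an illustrative sketch, not Paddle's implementation: it covers only the buffered case (capacity > 0) and omits the sendq/recvq bookkeeping and the close semantics described above.
+
+```c++
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+
+template <typename T>
+class BufferedChannel {
+ public:
+  explicit BufferedChannel(size_t capacity) : capacity_(capacity) {}
+
+  void Send(T value) {
+    std::unique_lock<std::mutex> lock(mu_);
+    // Block the sender while the buffer is full.
+    send_cv_.wait(lock, [&] { return buff_.size() < capacity_; });
+    buff_.push(std::move(value));
+    recv_cv_.notify_one();  // wake one pending receiver
+  }
+
+  T Recv() {
+    std::unique_lock<std::mutex> lock(mu_);
+    // Block the receiver while the buffer is empty.
+    recv_cv_.wait(lock, [&] { return !buff_.empty(); });
+    T value = std::move(buff_.front());
+    buff_.pop();
+    send_cv_.notify_one();  // wake one pending sender
+    return value;
+  }
+
+ private:
+  size_t capacity_;
+  std::queue<T> buff_;
+  std::mutex mu_;
+  std::condition_variable send_cv_, recv_cv_;
+};
+```
+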
+### State diagram
+
+#### Channel Send
+
+
+![](./images/channel_send.png)
+
+
+#### Channel Receive
+
+
+![](./images/channel_recv.png)
+
+
+## Limitations and Considerations
+
+### Variable Copy
+
+In Go, variables in channels are copied from the sender to the receiver.
+In Paddle, the data of our variables is **moved** from sender to receiver.
+As a result, these variables should not be used after they are sent. We
+provide a flag in the channel_send method to allow users to copy the
+variable before it is sent.
+
+Please note that this is achieved by adding an **assign** operator and
+creating a temporary variable that is sent in place of the original
+variable. Also note that the **assign** operator supports only certain
+variable data types.
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
index f022e67fd3a048cd7e53c91d9a1fd0506487b665..64602166065af28309d7a01fdeb7076a9b0a081a 100644
--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -10,12 +10,38 @@ The answer relies on the fact that a `ProgramDesc` is similar to an abstract syn
The following table compares concepts in Fluid and Go
-| Go | Fluid |
-|----|-------|
-|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) |
-| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
-| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
-| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
+
+<table>
+<thead>
+<tr>
+<th>Go</th>
+<th>Fluid</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>user-defined functions</td>
+<td><a href="https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid">layers</a></td>
+</tr>
+<tr>
+<td>control-flow and built-in functions</td>
+<td><a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators">intrinsics/operators</a></td>
+</tr>
+<tr>
+<td>goroutines, channels</td>
+<td><a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h">class ThreadPool</a></td>
+</tr>
+<tr>
+<td>runtime</td>
+<td><a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h">class Executor</a></td>
+</tr>
+</tbody>
+</table>
+
## An Example Concurrent Program
@@ -77,11 +103,11 @@ message ProgramDesc {
read(output = X)
kube_get_workers_addrs(output = L)
Y = tensor_array(len(L))
- parallel_for(input = X, output = Y,
+ parallel_for(input = X, output = Y,
attrs = {L, block_id(1)}) # referring to block 1
]
}
-
+
block[1] = Block {
parent = 0,
vars = [x, y, index],
@@ -102,7 +128,7 @@ func main() { //// block 0
X = fluid.read(...)
L = fluid.k8s.get_worker_addrs()
Y = fluid.tensor_array(len(L))
- fluid.parallel_for(X, L,
+ fluid.parallel_for(X, L,
func(index int) { //// block 1
x = X[index]
fluid.send(L[index], x)
@@ -116,7 +142,7 @@ An explanation of the above program:
- `fluid.k8s` is a package that provides access to Kubernetes API.
- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).
-- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread
diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md
index 10d936860fab7e09241e968a63526c7d86d3e568..66d19f44baf861c7847e81ca83f61024ec877faf 100644
--- a/doc/fluid/design/concurrent/csp.md
+++ b/doc/fluid/design/concurrent/csp.md
@@ -13,14 +13,41 @@ Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously exe
There were many concurrent programming models, implemented in various forms:
-| concurrent programming model | implementation |
-|-----|-----|
-| mutex | types and functions in standard libraries |
-| semaphore | types and functions in standard libraries |
-| communicating sequential processes (CSP) | Go programming language |
-| actor model | Erlang programming language |
-| message passing | MPI |
-| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+
+<table>
+<thead>
+<tr>
+<th>concurrent programming model</th>
+<th>implementation</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>mutex</td>
+<td>types and functions in standard libraries</td>
+</tr>
+<tr>
+<td>semaphore</td>
+<td>types and functions in standard libraries</td>
+</tr>
+<tr>
+<td>communicating sequential processes (CSP)</td>
+<td>Go programming language</td>
+</tr>
+<tr>
+<td>actor model</td>
+<td>Erlang programming language</td>
+</tr>
+<tr>
+<td>message passing</td>
+<td>MPI</td>
+</tr>
+<tr>
+<td>bulk synchronous parallel (BSP)</td>
+<td>Pregel distributed programming framework</td>
+</tr>
+</tbody>
+</table>
+
Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
@@ -118,9 +145,9 @@ There are four types of actions with a channel:
```go
close(ch)
```
-
+
Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
-
+
There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
1. A send to a nil channel blocks forever
diff --git a/doc/fluid/design/concurrent/go_op.md b/doc/fluid/design/concurrent/go_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..c18b788e80f432ebb2f14b15229e7823c112001e
--- /dev/null
+++ b/doc/fluid/design/concurrent/go_op.md
@@ -0,0 +1,231 @@
+# go_op Design
+
+## Introduction
+
+The **go_op** allows users of PaddlePaddle to run program blocks on a detached
+thread. It works in conjunction with CSP operators (channel_send,
+channel_receive, channel_open, channel_close, and select) to allow users to
+concurrently process data and communicate easily between different threads.
+
+## How to use it
+
+```
+channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+with fluid.Go():
+ # Send a tensor of value 99 to "channel" on a detached thread
+ tensor = fill_constant(shape=[1], dtype='int', value=99)
+ tensor.stop_gradient = True
+ fluid.channel_send(channel, tensor)
+
+# Receive sent tensor from "channel" on the main thread
+result = fill_constant(shape=[1], dtype='int', value=-1)
+fluid.channel_recv(channel, result)
+```
+
+The go operator can be accessed by using the fluid.Go() control flow. This
+will create a new sub block, where the user can add additional operators
+to be run on the detached thread.
+
+**Note:** Since backpropagation is currently not supported in the go_op, users
+should ensure that the operators in the go block do not require gradient
+calculations.
+
+## How it Works
+
+Similar to other control blocks, go_op will create a sub block and add it
+as a child to the current block. Operators and variables defined in this
+block will be added to the go sub_block.
+
+In addition, the go operator will create a new child scope whose parent is
+the global scope. Please refer to [block captures](#block-captures) for more
+information.
+
+When Paddle executor runs go_op, go_op will take the sub_block and pass it to
+the executor.run method (along with a newly created local scope) on a detached
+thread.
+
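+Conceptually, the hand-off looks like the following sketch (all names are illustrative placeholders, not the actual Paddle classes):
+
+```c++
+#include <thread>
+
+struct Scope {};      // placeholder for Paddle's Scope
+struct BlockDesc {};  // placeholder for the sub-block description
+struct Executor {
+  void Run(const BlockDesc* block, Scope* scope) { /* run the block's ops */ }
+};
+
+// go_op hands its sub_block and a newly created local scope to the
+// executor on a detached thread, then returns immediately.
+void RunGoOp(Executor* executor, const BlockDesc* sub_block, Scope* local_scope) {
+  std::thread([=] { executor->Run(sub_block, local_scope); }).detach();
+}
+```
+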
+An example of the generated program description is shown below. Take note of
+the **go_op** in particular. It is added as an operator in the current
+block (in this example, block0). The **go_op** contains a `sub_block`
+attribute, which points to the id of the block that will be executed in a
+detached thread.
+
+```
+blocks {
+ idx: 0
+ parent_idx: -1
+ vars {
+ name: "return_value"
+ type {
+ type: LOD_TENSOR
+ lod_tensor {
+ tensor {
+ data_type: INT64
+ }
+ }
+ }
+ }
+ vars {
+ name: "status_recv"
+ type {
+ type: LOD_TENSOR
+ lod_tensor {
+ tensor {
+ data_type: BOOL
+ }
+ }
+ }
+ }
+ ...
+ ops {
+ outputs {
+ parameter: "Out"
+ arguments: "channel"
+ }
+ type: "channel_create"
+ attrs {
+ name: "data_type"
+ type: INT
+ i: 7
+ }
+ attrs {
+ name: "capacity"
+ type: INT
+ i: 0
+ }
+ }
+ ops {
+ inputs {
+ parameter: "X"
+ arguments: "channel"
+ }
+ type: "go"
+ attrs {
+ name: "sub_block"
+ type: BLOCK
+ block_idx: 1
+ }
+ }
+ ops {
+ inputs {
+ parameter: "Channel"
+ arguments: "channel"
+ }
+ outputs {
+ parameter: "Out"
+ arguments: "return_value"
+ }
+ outputs {
+ parameter: "Status"
+ arguments: "status_recv"
+ }
+ type: "channel_recv"
+ }
+ ...
+}
+
+blocks {
+ idx: 1
+ parent_idx: 0
+ vars {
+ name: "status"
+ type {
+ type: LOD_TENSOR
+ lod_tensor {
+ tensor {
+ data_type: BOOL
+ }
+ }
+ }
+ }
+ ...
+
+ ops {
+ outputs {
+ parameter: "Out"
+ arguments: "fill_constant_1.tmp_0"
+ }
+ type: "fill_constant"
+ attrs {
+ name: "force_cpu"
+ type: BOOLEAN
+ b: false
+ }
+ attrs {
+ name: "value"
+ type: FLOAT
+ f: 99.0
+ }
+ attrs {
+ name: "shape"
+ type: INTS
+ ints: 1
+ }
+ attrs {
+ name: "dtype"
+ type: INT
+ i: 3
+ }
+ }
+ ops {
+ inputs {
+ parameter: "Channel"
+ arguments: "channel"
+ }
+ inputs {
+ parameter: "X"
+ arguments: "fill_constant_1.tmp_0"
+ }
+ outputs {
+ parameter: "Status"
+ arguments: "status"
+ }
+ type: "channel_send"
+ attrs {
+ name: "copy"
+ type: BOOLEAN
+ b: false
+ }
+ }
+```
+
+## Current Limitations
+
+#### Scopes and block captures:
+
+Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a
+block. When a block is executed, a new local scope is created from the parent
+scope (i.e., the scope derived from the parent block) and associated with the new
+child block. After the block finishes executing, the local scope and
+all associated variables in the scope are deleted.
+
+This works well in a single-threaded scenario; however, with the introduction of
+go_op, a child block may continue to execute even after the parent block has
+exited. If the go_op tries to access variables located in the parent block's
+scope, it may receive a segmentation fault because the parent scope may have
+been deleted.
+
+We need to implement block closures in order to prevent access to parent
+scope variables from causing a segmentation fault. As a temporary workaround,
+please ensure that all variables accessed in the go block are not destructed
+before they are accessed. Currently, the go_op will explicitly enforce
+this requirement and raise an exception if a variable cannot be found in
+the scope.
+
+Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502)
+for more details.
+
+#### Green Threads
+
+Golang utilizes `green threads`, a mechanism for the runtime library to
+manage multiple threads (instead of having the OS manage them natively).
+Green threads usually allow for faster thread creation and switching, as
+there is less overhead when spawning these threads. For the first version
+of CSP, we only support OS threads.
+
+
+#### Backward Propagation:
+
+go_op currently does not support backward propagation. Please use go_op only
+with non-training operators.
diff --git a/doc/fluid/design/concurrent/images/channel_recv.png b/doc/fluid/design/concurrent/images/channel_recv.png
new file mode 100644
index 0000000000000000000000000000000000000000..c06cd15ae7b8a8c94d5742f6675e389081fcf789
Binary files /dev/null and b/doc/fluid/design/concurrent/images/channel_recv.png differ
diff --git a/doc/fluid/design/concurrent/images/channel_send.png b/doc/fluid/design/concurrent/images/channel_send.png
new file mode 100644
index 0000000000000000000000000000000000000000..006ebb4a5a4bcd32c97847e9fb7729a740255f7c
Binary files /dev/null and b/doc/fluid/design/concurrent/images/channel_send.png differ
diff --git a/doc/fluid/design/concurrent/index_cn.rst b/doc/fluid/design/concurrent/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e47135e9fc42760898083710e0a6767252a0225b
--- /dev/null
+++ b/doc/fluid/design/concurrent/index_cn.rst
@@ -0,0 +1,8 @@
+并发编程
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ concurrent_programming.md
+ parallel_do.md
diff --git a/doc/fluid/design/concurrent/index_en.rst b/doc/fluid/design/concurrent/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0727e75798b2a869588f80d3cce7a886554e4ffb
--- /dev/null
+++ b/doc/fluid/design/concurrent/index_en.rst
@@ -0,0 +1,8 @@
+Concurrent Programming
+-------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ concurrent_programming.md
+ parallel_do.md
diff --git a/doc/fluid/design/data_type/index_cn.rst b/doc/fluid/design/data_type/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b60167b6b1599df69dfc5073ebf32bdbb0a316ec
--- /dev/null
+++ b/doc/fluid/design/data_type/index_cn.rst
@@ -0,0 +1,7 @@
+数据类型
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ float16.md
diff --git a/doc/fluid/design/data_type/index_en.rst b/doc/fluid/design/data_type/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6a88d17943f49134a2d00363845e919537ff4545
--- /dev/null
+++ b/doc/fluid/design/data_type/index_en.rst
@@ -0,0 +1,7 @@
+Data Type
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ float16.md
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
index e543adf0f97cc6b47415b807d7a1ed1effec9b22..988729138926f035750b59eb245dde82502a3ad2 100644
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@@ -1,4 +1,4 @@
-## Design Doc: Distributed Lookup Table Operator
+# Design Doc: Distributed Lookup Table Operator
A lookup table operator in PaddlePaddle where the table could be out
of the memory of a computer.
diff --git a/doc/fluid/design/dist_train/index_cn.rst b/doc/fluid/design/dist_train/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed6f3dda271d2de58d92aa7ec804fa9e68dfc48a
--- /dev/null
+++ b/doc/fluid/design/dist_train/index_cn.rst
@@ -0,0 +1,9 @@
+分布式训练
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ distributed_architecture.md
+ distributed_lookup_table_design.md
+ parameter_server.md
diff --git a/doc/fluid/design/dist_train/index_en.rst b/doc/fluid/design/dist_train/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f84688f168021113bd933802709bcd787b474bca
--- /dev/null
+++ b/doc/fluid/design/dist_train/index_en.rst
@@ -0,0 +1,9 @@
+Distributed Training
+---------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ distributed_architecture.md
+ distributed_lookup_table_design.md
+ parameter_server.md
diff --git a/doc/fluid/design/dynamic_rnn/index_cn.rst b/doc/fluid/design/dynamic_rnn/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1d224d22cf7103616f44115db01f0ae55f1cb88a
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/index_cn.rst
@@ -0,0 +1,8 @@
+动态RNN
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ rnn.md
+ rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/index_en.rst b/doc/fluid/design/dynamic_rnn/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..568f496e4ffe21a5e730488aef905f7e2d98839e
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/index_en.rst
@@ -0,0 +1,8 @@
+Dynamic RNN
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ rnn.md
+ rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/rnn_design.md b/doc/fluid/design/dynamic_rnn/rnn_design.md
index 3d38b9a0ad225fd8e0c1bb037474b292b1887f5b..cecfcd3307ae4c4fa603220a360e9e124069fa58 100644
--- a/doc/fluid/design/dynamic_rnn/rnn_design.md
+++ b/doc/fluid/design/dynamic_rnn/rnn_design.md
@@ -99,7 +99,7 @@ private:
- 由于传递过程是以复制`shared_ptr`的方式实现,因此框架只需要传递一次 `lod_start_pos`
2. 对于不感知 `lod_start_pos` 的Op足够透明
-3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据
+3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据
具体的设计分为以下3小节
@@ -189,7 +189,7 @@ struct SortedSeqItem {
std::vector<SortedSeqItem> sorted_seqs;
```
-来追踪序列排序后的位置,并添加一个新的接口
+来追踪序列排序后的位置,并添加一个新的接口
```c++
std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
@@ -233,7 +233,10 @@ x x
- 将每个序列concat 为规则的mini-batch表示
## 参考文献
-1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
-2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
-3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
-4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
+[Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
+
+[mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
+
+[variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
+
+[Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/doc/fluid/design/execution/index_cn.rst b/doc/fluid/design/execution/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed31b017429d168b2466d8f6b423f48bd5d78d1f
--- /dev/null
+++ b/doc/fluid/design/execution/index_cn.rst
@@ -0,0 +1,8 @@
+执行流程
+-------------
+
+.. toctree::
+ :maxdepth: 1
+
+ switch.md
+ if_else_op.md
diff --git a/doc/fluid/design/execution/index_en.rst b/doc/fluid/design/execution/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fcf846da348ff0bed707c42718e08314998fbac0
--- /dev/null
+++ b/doc/fluid/design/execution/index_en.rst
@@ -0,0 +1,8 @@
+Execution Process
+--------------------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ switch.md
+ if_else_op.md
diff --git a/doc/fluid/design/execution/switch.md b/doc/fluid/design/execution/switch.md
index 827d0601c621e4a230de28e2baad8e196e69625e..1c337bd7159b25e594c2f91f9a143b3f4bc3c8e8 100644
--- a/doc/fluid/design/execution/switch.md
+++ b/doc/fluid/design/execution/switch.md
@@ -1,6 +1,6 @@
-### Design Doc: Switch
+# Design Doc: Switch
-### Background
+## Background
Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
@@ -19,7 +19,7 @@ with switch() as switch:
fluid.print("Case 3")
```
-### The Semantics
+## The Semantics
1. A `switch` control-flow checks cases one-by-one.
1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values.
diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst
index f1887be6901653d4263d711d78b626d2abfd45c9..e9f55214f411abb11bef180d7af4716ad85a0b09 100644
--- a/doc/fluid/design/index_cn.rst
+++ b/doc/fluid/design/index_cn.rst
@@ -1,2 +1,19 @@
设计思想
------------
+
+.. toctree::
+ :maxdepth: 1
+
+ motivation/index_cn.rst
+ execution/index_cn.rst
+ concepts/index_cn.rst
+ data_type/index_cn.rst
+ memory/index_cn.rst
+ muti_devices/index_cn.rst
+ dynamic_rnn/index_cn.rst
+ concurrent/index_cn.rst
+ algorithm/index_cn.rst
+ network/index_cn.rst
+ modules/index_cn.rst
+ interface/index_cn.rst
+ dist_train/index_cn.rst
diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst
index 18a4b4122f6e3f0096676f34ffea8a80aa9b6696..2802dc3a31d540c5a19bf9042053496aad152f98 100644
--- a/doc/fluid/design/index_en.rst
+++ b/doc/fluid/design/index_en.rst
@@ -1,2 +1,19 @@
Design
------------
+
+.. toctree::
+ :maxdepth: 1
+
+ motivation/index_en.rst
+ execution/index_en.rst
+ concepts/index_en.rst
+ data_type/index_en.rst
+ memory/index_en.rst
+ muti_devices/index_en.rst
+ dynamic_rnn/index_en.rst
+ concurrent/index_en.rst
+ algorithm/index_en.rst
+ network/index_en.rst
+ modules/index_en.rst
+ interface/index_en.rst
+ dist_train/index_en.rst
diff --git a/doc/fluid/design/interface/index_cn.rst b/doc/fluid/design/interface/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..69a8d9bad4fe88935b9fa87757abf0105ca8eb75
--- /dev/null
+++ b/doc/fluid/design/interface/index_cn.rst
@@ -0,0 +1,4 @@
+多语言接口
+------------
+
+TBD
diff --git a/doc/fluid/design/interface/index_en.rst b/doc/fluid/design/interface/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..22abc71f984aa5da7151d5ebf0c3bdbcc69a3624
--- /dev/null
+++ b/doc/fluid/design/interface/index_en.rst
@@ -0,0 +1,4 @@
+Multi-Language Interface
+-------------------------
+
+TBD
diff --git a/doc/fluid/design/memory/index_cn.rst b/doc/fluid/design/memory/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c507c638bd1a6eb428175ed2756a6ecfc6cca198
--- /dev/null
+++ b/doc/fluid/design/memory/index_cn.rst
@@ -0,0 +1,7 @@
+内存管理
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ memory_optimization.md
diff --git a/doc/fluid/design/memory/index_en.rst b/doc/fluid/design/memory/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f7526437a73a09b300f05e138084755f5528b242
--- /dev/null
+++ b/doc/fluid/design/memory/index_en.rst
@@ -0,0 +1,7 @@
+Memory Management
+-------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ memory_optimization.md
diff --git a/doc/fluid/design/modules/evaluator.md b/doc/fluid/design/modules/evaluator.md
index 11cc129d56905a9ee666da92fbe6f8559c6d325a..de9605b0e67a035ab1ef1e4cafbe838f83bc5807 100644
--- a/doc/fluid/design/modules/evaluator.md
+++ b/doc/fluid/design/modules/evaluator.md
@@ -1,10 +1,10 @@
-## Evaluator Design
+# Evaluator Design
-### Problem Statement
+## Problem Statement
During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants.
-### Evaluator Design
+## Evaluator Design
Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
1. Initialize the metric state and add it into the block.
@@ -14,11 +14,11 @@ Currently, every operation is expressed in the graph. We divide the evaluator pr
3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
-### Implementation
-This design is shown in the Python API.
-Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass.
+## Implementation
+This design is shown in the Python API.
+Each metric operator needs to calculate the metric statistic and return the batch-aware states. The Python side is responsible for accumulating the states for each pass.
+
-
```python
class Evaluator(object):
"""
@@ -32,7 +32,7 @@ class Evaluator(object):
The initialization of Evaluator should be responsible for:
create metric states and append to the main_program
- """
+ """
pass
    def _update_ops(self, input, label, **kwargs):
@@ -40,14 +40,14 @@ class Evaluator(object):
        Add mini-batch evaluator calculation operators to the main_program.
Add increment operator to accumulate the metric states.
"""
-
+
def reset(self, executor, reset_program=None):
"""
        Reset metric states at the beginning of each pass/user-specified batch number.
Execute the reset_program to reset the states.
"""
-
+
def eval(self, executor, eval_program=None):
"""
diff --git a/doc/fluid/design/modules/index_cn.rst b/doc/fluid/design/modules/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b25783f0f5120991c29ba31b7b512bd4c183eecf
--- /dev/null
+++ b/doc/fluid/design/modules/index_cn.rst
@@ -0,0 +1,14 @@
+代码结构和重要模块
+-----------------
+
+.. toctree::
+ :maxdepth: 1
+
+ backward.md
+ python_api.md
+ regularization.md
+ infer_var_type.md
+ optimizer.md
+ prune.md
+ register_grad_op.md
+ net_op_design.md
diff --git a/doc/fluid/design/modules/index_en.rst b/doc/fluid/design/modules/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2108156e080996916f2650448f0a56f998757204
--- /dev/null
+++ b/doc/fluid/design/modules/index_en.rst
@@ -0,0 +1,14 @@
+Code Structure and Important Modules
+-------------------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ backward.md
+ python_api.md
+ regularization.md
+ infer_var_type.md
+ optimizer.md
+ prune.md
+ register_grad_op.md
+ net_op_design.md
diff --git a/doc/fluid/design/modules/net_op_design.md b/doc/fluid/design/modules/net_op_design.md
index a5f0483081e8a03b2d001a551fcc02bbd392016d..e64ac2fb1c6898bfeb883250347da3d9a4757b97 100644
--- a/doc/fluid/design/modules/net_op_design.md
+++ b/doc/fluid/design/modules/net_op_design.md
@@ -1,16 +1,16 @@
# Network Design
`Network` is the container and controller of a set of operators,
-user can build a real network from a `NetDesc` which is a protobuf message
+users can build a real network from a `NetDesc`, which is a protobuf message,
and use `Network.Run()` to run all the operators in the network.
-A network object knows all Operators belonging to this network. Variables,
-which are inputs and outputs of these operators,
+A network object knows all Operators belonging to this network. Variables,
+which are inputs and outputs of these operators,
are created and managed by a hierarchy of Scope objects.
-# API
+## API
-## Net
+### Net
To make the `Network` extendable, a base class is defined like this
```c++
@@ -43,8 +43,8 @@ class Net {
};
```
-All network implementations should build networks from a protobuf message which
-describes the structure of a real network; `Run` method should be implemented by
+All network implementations should build networks from a protobuf message which
+describes the structure of a real network; `Run` method should be implemented by
all implementations to offer a universal method to forward or backward compute a network.
`Net::Create` is a method of factory pattern and can be implemented like
@@ -64,7 +64,7 @@ std::unique_ptr<Net> Net::Create(const NetDesc& def) {
```
Network is designed as the container of operators. To make it more extendable,
-we decouple it from the related variable resources.
+we decouple it from the related variable resources.
`Run(Scope* scope)` takes the scope as an argument so that it can run in different scopes.
@@ -80,7 +80,7 @@ if (net) {
}
```
-## `PlainNet` as a simple implementation of `BaseNet`
+### `PlainNet` as a simple implementation of `BaseNet`
A very basic implementation is as follows. All it does is simply run every operator in sequence.
@@ -211,9 +211,9 @@ class NetBuilder final {
}
```
-## Compatibility with RNN
+### Compatibility with RNN
-Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design,
+Benefiting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with the future RNN design;
for example, we can implement a simple recurrent neural network as follows
```c++
diff --git a/doc/fluid/design/modules/optimizer.md b/doc/fluid/design/modules/optimizer.md
index 691081c268b848811bf5ee6d6a41edfe0f47eec0..1c25fde9cafb322f789662077d3fc6cc1d64ce38 100644
--- a/doc/fluid/design/modules/optimizer.md
+++ b/doc/fluid/design/modules/optimizer.md
@@ -1,6 +1,6 @@
-## Optimizer Design
+# Optimizer Design
-### The Problem
+## The Problem
A PaddlePaddle program, or a block, is a sequence of operators operating on variables. A training program needs to do three kinds of work:
@@ -19,7 +19,7 @@ It's true that users should be able to create all these operators manually by ca
In this design, we propose a high-level API that automatically derives the optimization pass and operators from the forward pass.
-### High-level Python API to describe the training process
+## High-level Python API to describe the training process
1. Users write code to describe the network:
@@ -54,7 +54,7 @@ In this design, we propose a high-level API that automatically derives the optim
    sess.run(target=opt_op_list, ...)
```
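
To make the derivation concrete, here is a toy, framework-free sketch of what a `minimize()`-style API produces from the forward pass; the string-based "program" and op names are purely illustrative, not Paddle's actual internals.

```python
# Toy sketch: derive gradient ops (in reverse order) and per-parameter
# update ops from a forward op list.
def minimize(forward_ops, params, lr=0.01):
    backward_ops = ["%s_grad" % op for op in reversed(forward_ops)]
    update_ops = ["sgd_update(%s, lr=%g)" % (p, lr) for p in params]
    return forward_ops + backward_ops + update_ops

print(minimize(["mul", "add", "mse_loss"], params=["W", "b"]))
# ['mul', 'add', 'mse_loss', 'mse_loss_grad', 'add_grad', 'mul_grad',
#  'sgd_update(W, lr=0.01)', 'sgd_update(b, lr=0.01)']
```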
-#### Optimizer Python interface:
+### Optimizer Python interface:
```python
class Optimizer(object):
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
index 73f6d7b90c7dca0d48109cf3d28d5f7cd56b5c0b..f83ad3b6a4e8b4d82d8fe8d4154a2739a9b9628b 100644
--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@@ -2,12 +2,33 @@
Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
-| Python classes | Protobuf messages |
-| --- | --- |
-| Program | ProgramDesc |
-| Block | BlockDesc |
-| Operator | OpDesc |
-| Variable | VarDesc |
+
+<table>
+<thead>
+<tr><th>Python classes</th><th>Protobuf messages</th></tr>
+</thead>
+<tbody>
+<tr><td>Program</td><td>ProgramDesc</td></tr>
+<tr><td>Block</td><td>BlockDesc</td></tr>
+<tr><td>Operator</td><td>OpDesc</td></tr>
+<tr><td>Variable</td><td>VarDesc</td></tr>
+</tbody>
+</table>
+
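As a rough illustration of this mapping, the sketch below shows Python wrappers that build up their protobuf descs; the dict-based descs and field names are stand-ins, not Paddle's real classes.

```python
# Conceptual sketch: each Python object owns the desc it constructs,
# plus construction-time bookkeeping that never enters the message.
class Block(object):
    def __init__(self, idx):
        self.desc = {"idx": idx, "ops": [], "vars": []}  # stands in for BlockDesc
        self.parent_idx = -1   # construction-time info, not in the proto

class Program(object):
    def __init__(self):
        self.blocks = [Block(0)]                               # Python wrappers
        self.desc = {"blocks": [b.desc for b in self.blocks]}  # ProgramDesc

    def current_block(self):
        return self.blocks[-1]

prog = Program()
print(prog.current_block().desc["idx"])   # 0
```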
Please be aware that these Python classes need to maintain some construction-time information, which is not part of the protobuf messages.
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
index 110b7d78bf12ac8328fb3a913e4386e75d63c995..5e147f8263e685a4665b5793f7127178cbc3cfdd 100644
--- a/doc/fluid/design/motivation/fluid.md
+++ b/doc/fluid/design/motivation/fluid.md
@@ -10,11 +10,37 @@ Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution
Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
-| Existed since | model as sequence of layers | model as graph of operators | No model |
-|--|--|--|--|
-| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
-| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
-| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+
+<table>
+<thead>
+<tr><th>Existed since</th><th>model as sequence of layers</th><th>model as graph of operators</th><th>No model</th></tr>
+</thead>
+<tbody>
+<tr><td>2013</td><td>Caffe, Theano, Torch, PaddlePaddle</td><td></td><td></td></tr>
+<tr><td>2015</td><td></td><td>TensorFlow, MxNet, Caffe2, ONNX, n-graph</td><td></td></tr>
+<tr><td>2016</td><td></td><td></td><td>PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid</td></tr>
+</tbody>
+</table>
+
From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms*, or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
diff --git a/doc/fluid/design/motivation/index_cn.rst b/doc/fluid/design/motivation/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7706e73eca644ed6db772fd77da947395313237f
--- /dev/null
+++ b/doc/fluid/design/motivation/index_cn.rst
@@ -0,0 +1,10 @@
+设计动机和目标
+-------------
+
+.. toctree::
+ :maxdepth: 1
+
+ api.md
+ refactorization.md
+ fluid.md
+ fluid_compiler.md
diff --git a/doc/fluid/design/motivation/index_en.rst b/doc/fluid/design/motivation/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..10b64b257c604ced6b957d6d6018e8a363f00fac
--- /dev/null
+++ b/doc/fluid/design/motivation/index_en.rst
@@ -0,0 +1,10 @@
+Design Motivations and Goals
+--------------------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ api.md
+ refactorization.md
+ fluid.md
+ fluid_compiler.md
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
index f93d6155e1764386b01d2f0df3f141ab75cd55d4..f199cc892f5e84f0a12abe3b8e5cace9849e7fa8 100644
--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -36,11 +36,37 @@ At compile time, the Python program generates a protobuf message representation
At runtime, the C++ program realizes the graph and runs it.
-| | Representation (protobuf messages) | Realization (C++ class objects) |
-|---|---|---|
-|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
-|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
-|Block|BlockDesc|Block|
+
+<table>
+<thead>
+<tr><th></th><th>Representation (protobuf messages)</th><th>Realization (C++ class objects)</th></tr>
+</thead>
+<tbody>
+<tr><td>Data</td><td><a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107">VarDesc</a></td><td><a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24">Variable</a></td></tr>
+<tr><td>Operation</td><td><a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35">OpDesc</a></td><td><a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64">Operator</a></td></tr>
+<tr><td>Block</td><td>BlockDesc</td><td>Block</td></tr>
+</tbody>
+</table>
+
The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables, similar to a C++/Java program block or a pair of curly braces (`{` and `}`).
@@ -97,13 +123,13 @@ Compile Time -> IR -> Runtime
---
-# Operator/OpWithKernel/OpKernel
+## Operator/OpWithKernel/OpKernel
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
---
-# Operator
+## Operator
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
* `Operator` is the fundamental building block of the user interface.
@@ -113,7 +139,7 @@ Compile Time -> IR -> Runtime
---
-# OpWithKernel/Kernel
+## OpWithKernel/Kernel
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
@@ -124,7 +150,7 @@ Compile Time -> IR -> Runtime
---
-# Why separate Kernel and Operator
+## Why separate Kernel and Operator
* Separate GPU and CPU code.
* Make Paddle capable of running without GPU.
@@ -132,7 +158,7 @@ Compile Time -> IR -> Runtime
* For example, the same multiplication op can have different implementation kernels, such as an FP16 kernel, an FP32 kernel, an MKL kernel, or an Eigen kernel.
---
-# Libraries for Kernel development
+## Libraries for Kernel development
* `Eigen::Tensor` contains basic math and element-wise functions.
* Note that `Eigen::Tensor` has broadcast implementation.
@@ -143,16 +169,16 @@ Compile Time -> IR -> Runtime
* Hand-writing `GPUKernel` and `CPU` code
* Do not write kernels in header (`.h`) files. CPU kernels should be in C++ source (`.cc`) files and GPU kernels should be in CUDA (`.cu`) files. (GCC cannot compile GPU code.)
---
-# Operator Registration
+## Operator Registration
-## Why is registration necessary?
+### Why is registration necessary?
We need a method to build mappings between Op type names and Op classes.
-## How is registration implemented?
+### How is registration implemented?
We maintain a map whose key is the type name and whose value is the corresponding Op constructor.
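
A conceptual Python sketch of this mechanism (the real `OpInfoMap` is a C++ singleton, so the decorator below is only an analogy):

```python
# Registry sketch: a map from the op type name to its constructor,
# filled at registration time and queried when ops are created.
op_registry = {}

def register_op(op_type):
    def decorator(op_class):
        op_registry[op_type] = op_class   # type name -> constructor
        return op_class
    return decorator

@register_op("scale")
class ScaleOp(object):
    def run(self):
        print("running scale")

op_registry["scale"]().run()   # look up the constructor by type name
```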
---
-# The Registry Map
+## The Registry Map
### `OpInfoMap`
@@ -166,7 +192,7 @@ Maintaining a map, whose key is the type name and the value is the corresponding
- **`checker`**: Used to check attributes.
---
-# Related Concepts
+## Related Concepts
### Op_Maker
Its constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
@@ -178,7 +204,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
```
---
-# Registration Process
+## Registration Process
1. Write an Op class and its gradient Op class, if required.
2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
3. Invoke the macro `REGISTER_OP`. This macro will
@@ -186,13 +212,13 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
---
-# Backward Module (1/2)
+## Backward Module (1/2)
### Create Backward Operator
- Mapping from forward Op to backward Op
![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
---
-# Backward Module (2/2)
+## Backward Module (2/2)
### Build Backward Network
- **Input**: a graph of forward operators
- **Output**: a graph of backward operators
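
A toy sketch of this transformation is shown below; `grad_op_of` is a hypothetical stand-in for the registered gradient-op makers, and corner cases such as shared weights are ignored.

```python
# Toy sketch: traverse the forward graph in reverse and emit one
# gradient op per forward op.
def grad_op_of(op):
    return op + "_grad"   # hypothetical forward-op -> backward-op mapping

def build_backward(forward_ops):
    return [grad_op_of(op) for op in reversed(forward_ops)]

print(build_backward(["mul", "add", "softmax"]))
# ['softmax_grad', 'add_grad', 'mul_grad']
```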
@@ -205,7 +231,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
---
-# Scope, Variable, Tensor
+## Scope, Variable, Tensor
* `Tensor` is an n-dimensional array with a data type.
* Only dims and data pointers are stored in `Tensor`.
@@ -218,8 +244,8 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
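
The parent-fallback lookup can be sketched in a few lines of Python (Paddle's `Scope` is C++; the method names below mirror it only loosely):

```python
# Scope sketch: find_var falls back to the parent scope when a
# variable is missing locally.
class Scope(object):
    def __init__(self, parent=None):
        self._vars = {}
        self._parent = parent

    def var(self, name):
        # Create the variable in *this* scope if it does not exist yet.
        return self._vars.setdefault(name, object())

    def find_var(self, name):
        if name in self._vars:
            return self._vars[name]
        return self._parent.find_var(name) if self._parent else None

global_scope = Scope()
global_scope.var("W")
local_scope = Scope(parent=global_scope)
assert local_scope.find_var("W") is not None   # found via the parent scope
```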
---
-# Block (in design)
-## the difference between original RNNOp and Block
+## Block (in design)
+### the difference between original RNNOp and Block
- As an operator is more intuitive than `RNNOp`,
- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
- Fits the compile-time/runtime separation design paradigm.
@@ -227,7 +253,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
- When the graph executes, a Block with a `BlockDesc` is passed; it then creates `Op` and `Var` instances and invokes `Run`.
---
-# Milestone
+## Milestone
- Taking Paddle/books as the main line, the requirements of the models motivate the framework refactoring,
- Model migration
- Framework development gives **priority support** to model migration, for example,
@@ -240,7 +266,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
- Accept imperfection, concentrate on solving the specific problem at the right price.
---
-# Control the migration quality
+## Control the migration quality
- Compare the performance of migrated models with old ones.
- Follow the Google C++ style guide.
- Build the automatic workflow of generating Python/C++ documentation.
diff --git a/doc/fluid/design/muti_devices/index_cn.rst b/doc/fluid/design/muti_devices/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1f8439e8623e1c1ae9a12c24d08079f0ec3d761f
--- /dev/null
+++ b/doc/fluid/design/muti_devices/index_cn.rst
@@ -0,0 +1,9 @@
+多设备支持
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ operator_kernel_type.md
+ kernel_selection.md
+ kernel_hint_design.md
diff --git a/doc/fluid/design/muti_devices/index_en.rst b/doc/fluid/design/muti_devices/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..819e9c5d77b2abf8da0e2ce6f494ea5174c1d0a2
--- /dev/null
+++ b/doc/fluid/design/muti_devices/index_en.rst
@@ -0,0 +1,9 @@
+Multi-Device Support
+----------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ operator_kernel_type.md
+ kernel_selection.md
+ kernel_hint_design.md
diff --git a/doc/fluid/design/muti_devices/kernel_hint_design.md b/doc/fluid/design/muti_devices/kernel_hint_design.md
index a54b7da045e1a362626ef066f9ebb56af2c3181a..728c8f0b964c02c1efa019945f7427fa879d3aa1 100644
--- a/doc/fluid/design/muti_devices/kernel_hint_design.md
+++ b/doc/fluid/design/muti_devices/kernel_hint_design.md
@@ -1,4 +1,4 @@
-## Problem
+# Problem
In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel. We need to provide a way for users to do this.
In the current design, we use KernelType to describe one kernel.
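
To make the idea concrete, here is a conceptual sketch of hint-aware kernel selection; the real dispatch is C++ and keys kernels by `OpKernelType`, so the dict and names below are illustrative only.

```python
# Sketch: pick a kernel while honoring user hints such as
# force_cpu / use_cudnn.
def choose_kernel(kernels, force_cpu=False, use_cudnn=False):
    if use_cudnn and "cudnn" in kernels:
        return kernels["cudnn"]      # honor the CUDNN preference
    if force_cpu:
        return kernels["cpu"]        # pin the op to its CPU kernel
    return kernels.get("cuda", kernels["cpu"])   # default: best available

kernels = {"cpu": "GemmConvCPU", "cuda": "GemmConvCUDA", "cudnn": "CUDNNConv"}
print(choose_kernel(kernels, use_cudnn=True))   # CUDNNConv
print(choose_kernel(kernels, force_cpu=True))   # GemmConvCPU
```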
diff --git a/doc/fluid/design/muti_devices/kernel_selection.md b/doc/fluid/design/muti_devices/kernel_selection.md
index 9719e031c70979cd95400701efd30879662e19bc..39ea2b00090a864f95610d6d2846ca5e5c904e78 100644
--- a/doc/fluid/design/muti_devices/kernel_selection.md
+++ b/doc/fluid/design/muti_devices/kernel_selection.md
@@ -1,4 +1,4 @@
-## Background
+# Background
Every operator has many kernels because there are multiple data types, places, data layouts, and library types that Fluid supports. We use the `OpKernelType` to describe the kernel types that operators can hold.
The `OpKernelType` is as follows:
diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md
index af0c6ef36feba9e0239e7a5f81a8dc9108b2471a..7f5dcf55f9f2a0fd27ffde100510dd8fee305381 100644
--- a/doc/fluid/design/network/deep_speech_2.md
+++ b/doc/fluid/design/network/deep_speech_2.md
@@ -1,4 +1,4 @@
-# DeepSpeech2 on PaddlePaddle: Design Doc
+# DeepSpeech2 on PaddlePaddle: Design Doc
We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine, on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
@@ -68,11 +68,33 @@ We roughly break down the project into 14 tasks:
Tasks parallelizable within phases:
-Roadmap | Description | Parallelizable Tasks
------------ | :------------------------------------ | :--------------------
-Phase I | Simplified model & components | *Task 1* ~ *Task 8*
-Phase II | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
-Phase III | Documentations | *Task13* ~ *Task14*
+
+<table>
+<thead>
+<tr><th>Roadmap</th><th>Description</th><th>Parallelizable Tasks</th></tr>
+</thead>
+<tbody>
+<tr><td>Phase I</td><td>Simplified model &amp; components</td><td><em>Task 1</em> ~ <em>Task 8</em></td></tr>
+<tr><td>Phase II</td><td>Standard model &amp; benchmarking &amp; profiling</td><td><em>Task 9</em> ~ <em>Task 12</em></td></tr>
+<tr><td>Phase III</td><td>Documentations</td><td><em>Task 13</em> ~ <em>Task 14</em></td></tr>
+</tbody>
+</table>
+
An issue for each task will be created later. Contributions, discussions, and comments are all highly appreciated and welcome!
@@ -102,37 +124,82 @@ We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar
Key ingredients about the layers:
-- **Data Layers**:
+- **Data Layers**:
  - Frame sequence data of the audio **spectrogram** (with FFT).
- - Token sequences data of **transcription** text (labels).
+  - Token sequence data of the **transcription** text (labels).
  - These two types of sequences do not have the same lengths; thus, a CTC-loss layer is required.
-- **2D Convolution Layers**:
+- **2D Convolution Layers**:
- Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
  - With striding for only the first convolution layer.
- No pooling for all convolution layers.
-- **Uni-directional RNNs**
+- **Uni-directional RNNs**
- Uni-directional + row convolution: for low-latency inference.
  - Bi-directional + without row convolution: if we don't care about the inference latency.
- **Row convolution**:
  - For looking only a few steps ahead into the future, instead of looking into a whole sequence as in bi-directional RNNs.
- - Not nessesary if with bi-direcitional RNNs.
+  - Not necessary with bi-directional RNNs.
- "**Row**" means convolutions are done within each frequency dimension (row), with no convolution kernels shared across rows.
- **Batch Normalization Layers**:
  - Added to all above layers (except for the data and loss layers).
  - Sequence-wise normalization for RNNs: BatchNorm is only performed on the input-state projection and not the state-state projection, for efficiency considerations.
-
-
-Required Components | PaddlePaddle Support | Need to Develop
-:------------------------------------- | :-------------------------------------- | :-----------------------
-Data Layer I (Spectrogram) | Not supported yet. | TBD (Task 3)
-Data Layer II (Transcription) | `paddle.data_type.integer_value_sequence` | -
-2D Convolution Layer | `paddle.layer.image_conv_layer` | -
-DataType Converter (vec2seq) | `paddle.layer.block_expand` | -
-Bi-/Uni-directional RNNs | `paddle.layer.recurrent_group` | -
-Row Convolution Layer | Not supported yet. | TBD (Task 4)
-CTC-loss Layer | `paddle.layer.warp_ctc` | -
-Batch Normalization Layer | `paddle.layer.batch_norm` | -
-CTC-Beam search | Not supported yet. | TBD (Task 6)
+
+<table>
+<thead>
+<tr><th>Required Components</th><th>PaddlePaddle Support</th><th>Need to Develop</th></tr>
+</thead>
+<tbody>
+<tr><td>Data Layer I (Spectrogram)</td><td>Not supported yet.</td><td>TBD (Task 3)</td></tr>
+<tr><td>Data Layer II (Transcription)</td><td><code>paddle.data_type.integer_value_sequence</code></td><td>-</td></tr>
+<tr><td>2D Convolution Layer</td><td><code>paddle.layer.image_conv_layer</code></td><td>-</td></tr>
+<tr><td>DataType Converter (vec2seq)</td><td><code>paddle.layer.block_expand</code></td><td>-</td></tr>
+<tr><td>Bi-/Uni-directional RNNs</td><td><code>paddle.layer.recurrent_group</code></td><td>-</td></tr>
+<tr><td>Row Convolution Layer</td><td>Not supported yet.</td><td>TBD (Task 4)</td></tr>
+<tr><td>CTC-loss Layer</td><td><code>paddle.layer.warp_ctc</code></td><td>-</td></tr>
+<tr><td>Batch Normalization Layer</td><td><code>paddle.layer.batch_norm</code></td><td>-</td></tr>
+<tr><td>CTC-Beam search</td><td>Not supported yet.</td><td>TBD (Task 6)</td></tr>
+</tbody>
+</table>
+
### Row Convolution
@@ -145,14 +212,14 @@ TODO by Assignees
Figure 2. Algorithm for CTC Beam Search Decoder.
-- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
- - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths;
+- The **Beam Search Decoder** for the DS2 CTC-trained network follows a similar approach to \[[3](#references)\], as shown in Figure 2, with two important modifications for the ambiguous parts:
+  - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, because one prefix may come from different paths;
  - 2) the if condition ```if l^+ not in A_prev then``` after the probabilities' computation is dropped, as it is hard to understand and seems unnecessary.
- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space is appended in English decoding or any character is appended in Mandarin decoding.
  - Such an external scorer consists of a language model, a word count, or any other custom scorers.
  - The **language model** is built in Task 5, and its parameters should be carefully tuned to achieve the minimum WER/CER (cf. Task 7).
-- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
-
+- This decoder needs to perform with **high efficiency** for the convenience of parameter tuning and real-world speech recognition.
+
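
The accumulation in modification 1) can be illustrated with a toy snippet (the prefixes and probabilities are made up):

```python
# When different paths collapse to the same prefix, accumulate their
# probabilities instead of overwriting them.
from collections import defaultdict

prefix_probs = defaultdict(float)
paths = [("he", 0.25), ("he", 0.25), ("hel", 0.2)]   # (prefix, path prob)
for prefix, p in paths:
    prefix_probs[prefix] += p   # accumulate: one prefix, many paths

print(dict(prefix_probs))   # {'he': 0.5, 'hel': 0.2}
```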
## Future Work
diff --git a/doc/fluid/design/network/index_cn.rst b/doc/fluid/design/network/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3557d55fe4dbae1f712e0760ca15111ec6f6792d
--- /dev/null
+++ b/doc/fluid/design/network/index_cn.rst
@@ -0,0 +1,7 @@
+复杂网络设计
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ sequence_decoder.md
diff --git a/doc/fluid/design/network/index_en.rst b/doc/fluid/design/network/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..73a7137236bdf0548d35721609351d6deca3013b
--- /dev/null
+++ b/doc/fluid/design/network/index_en.rst
@@ -0,0 +1,7 @@
+Complex Network Design
+------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ sequence_decoder.md
diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md
index 5596b2653ae6ed9917f77dad08f926bcb1fb3419..b50f18f21df0787b9761bf0935ed7f4384ff0f98 100644
--- a/doc/fluid/dev/api_doc_std_cn.md
+++ b/doc/fluid/dev/api_doc_std_cn.md
@@ -45,11 +45,11 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
- Python API Definition
- 格式:
-
+
[Python API Definition]
-
+
- 示例
-
+
```
fc(input,
size,
@@ -63,19 +63,19 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
```
- Function Description
-
+
- 格式
本模块应包含以下内容(排列顺序为文档撰写顺序):
[Function Description]
-
+
[Formula]
-
+
[Symbols' Descriptions if necessary]
-
+
[References if necessary]
-
+
- 示例
[Function Description]
@@ -119,18 +119,18 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
[References if necessary]
因fc没有必要列出的参考文献,故该内容省略。其他情况下需明确给出对应的参考文献和对应连接,以 layer_norm 为例:
-
+
```
    Refer to `Layer Normalization <https://arxiv.org/abs/1607.06450>`_ for more details.
```
-
+
- Args Description
-
+
- 格式
-
+
\[Arg's Name\][(Data Type, Default Value)][Description]
-
+
- 示例
fc的部分参数注释如下:
@@ -145,35 +145,35 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
```
- Returns
-
+
- 格式
-
+
[Name][Shape]
-
+
- 示例
-
+
```
Returns:
A tensor variable storing the transformation result.
```
-
+
当返回值为包含多个参数的tuple时,应按顺序逐个介绍各参数,以dynamic_lstm为例:
-
+
```
Returns:
A tuple containing:
The hidden state of LSTM whose shape is (T X D).
The cell state of LSTM whose shape is (T X D).
```
-
+
- Raises
- 格式
-
+
[Exception Type][Condition]
- 示例
-
+
```
Raises:
ValueError: If the rank of the input is less than 2.
@@ -182,7 +182,7 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
- Note
- 格式
-
+
[Note]
- 示例
@@ -198,15 +198,15 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
2. When num_heads == 1, scaled_dot_product_attention has no learnable
parameters.
```
-
+
- Examples
- 格式
  \[Python Code Snippet]
-
+
- 示例
-
+
```
Examples:
.. code-block:: python
diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..e57072d52fd162e92a3482aef33f99ab9394c532
--- /dev/null
+++ b/doc/fluid/dev/api_doc_std_en.md
@@ -0,0 +1,226 @@
+# API Doc Standard
+
+- [API Doc Structure](#api-doc-structure)
+- [Format and Examples](#format-and-examples)
+- [Complete Example](#complete-example)
+
+
+## API Doc Structure
+
+An API doc should contain the following parts (please write them in order):
+
+- Python API Definition
+
+  The definition of the API.
+
+- Function Description
+
+  Description of the API's function.
+  The description includes: meaning, purpose, the operation performed on the API's input, references and corresponding links (if any), formulas (if necessary), and explanations of key variables in the formula.
+
+- Args Description
+
+  Description of the API's parameters.
+  Introduce the parameters one by one, following their order in the API definition.
+  The introduction includes: data type, default value (if any), meaning, etc.
+
+- Returns
+
+  Introduction of the API's returned value.
+  Introduce the meaning of the returned value, and provide the corresponding format if necessary.
+  If the returned value is a tuple containing multiple parameters, then introduce the parameters one by one in order.
+
+- Raises(if any)
+
+  Exceptions or errors that may occur, and their possible causes. If more than one exception or error is possible, they should be listed in order.
+
+- Note(if any)
+
+  Matters needing attention. If there is more than one, they should be listed in order.
+
+- Examples
+
+  Examples of how to use the API.
+
+
+## Format and Examples
+
+API documentation must follow the reStructuredText format; please refer to [here](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html).
+The format and examples of each part of the API documentation are as follows (take fc for example):
+
+- Python API Definition
+
+ - Format
+
+ [Python API Definition]
+
+ - Example
+
+ ```
+ fc(input,
+ size,
+ num_flatten_dims=1,
+ param_attr=None,
+ bias_attr=None,
+ act=None,
+ name=None,
+ main_program=None,
+ startup_program=None)
+ ```
+
+- Function Description
+
+ - Format
+
+ This part contains (please write them in order):
+
+ [Function Description]
+
+ [Formula]
+
+ [Symbols' Descriptions if necessary]
+
+ [References if necessary]
+
+ - Example
+
+ [Function Description]
+
+ ```
+ **Fully Connected Layer**
+
+ The fully connected layer can take multiple tensors as its inputs. It
+ creates a variable called weights for each input tensor, which represents
+ a fully connected weight matrix from each input unit to each output unit.
+    The fully connected layer multiplies each input tensor with its corresponding
+    weight to produce an output Tensor. If multiple input tensors are given,
+    the results of multiple multiplications will be summed up. If bias_attr is
+ not None, a bias variable will be created and added to the output. Finally,
+ if activation is not None, it will be applied to the output as well.
+ ```
+
+ [Formula]
+
+ ```
+ This process can be formulated as follows:
+
+ .. math::
+
+ Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+ ```
+
+ [Symbols' Descriptions if necessary]
+
+ ```
+ In the above equation:
+
+ * :math:`N`: Number of the input.
+ * :math:`X_i`: The input tensor.
+ * :math:`W`: The weights created by this layer.
+ * :math:`b`: The bias parameter created by this layer (if needed).
+ * :math:`Act`: The activation function.
+ * :math:`Out`: The output tensor.
+ ```
+
+ [References if necessary]
+
+  Since fc needs no references, we omit them here. Under other circumstances, please provide an explicit reference and link; take layer_norm for example:
+
+ ```
+  Refer to `Layer Normalization <https://arxiv.org/abs/1607.06450>`_ for more details.
+ ```
+
+
+- Args Description
+
+ - Format
+
+ \[Arg's Name\][(Data Type, Default Value)][Description]
+
+ - Example
+
+  Part of the fc parameters are documented as follows:
+
+ ```
+ Args:
+ input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+ the input tensor(s) is at least 2.
+ param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+ parameters/weights of this layer.
+ name (str, default None): The name of this layer.
+ ```
+
+- Returns
+
+ - Format
+
+ [Name][Shape]
+
+ - Example
+
+ ```
+ Returns:
+ A tensor variable storing the transformation result.
+ ```
+
+  When the returned value is a tuple containing multiple parameters, please introduce every parameter in order; take dynamic_lstm for example:
+
+ ```
+ Returns:
+ A tuple containing:
+ The hidden state of LSTM whose shape is (T X D).
+ The cell state of LSTM whose shape is (T X D).
+ ```
+
+- Raises
+
+ - Format
+
+ [Exception Type][Condition]
+
+ - Example
+
+ ```
+ Raises:
+ ValueError: If the rank of the input is less than 2.
+ ```
+
+- Note
+
+ - Format
+
+ [Note]
+
+ - Example
+
+  There is no Note in fc, so we omit this part. If there is any note, please write it clearly. If there is more than one note, please list them in order. Take scaled\_dot\_product\_attention for example:
+
+ ```
+ Note:
+ 1. When num_heads > 1, three linear projections are learned respectively
+ to map input queries, keys and values into queries', keys' and values'.
+ queries', keys' and values' have the same shapes with queries, keys
+ and values.
+ 2. When num_heads == 1, scaled_dot_product_attention has no learnable
+ parameters.
+ ```
+
+- Examples
+
+ - Format
+
+  \[Python Code Snippet]
+
+ - Example
+
+ ```
+ Examples:
+ .. code-block:: python
+
+ data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+ fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+ ```
+
+## Complete Example
+
+For a complete example of fc, please see [here](src/fc.py).
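
Putting the parts together, a minimal docstring skeleton that follows this standard might look like the sketch below; `my_scale` is a hypothetical operator used purely for illustration.

```python
def my_scale(input, scale=1.0, name=None):
    r"""
    **My Scale Layer**

    Multiplies the input tensor by `scale`.

    .. math::

        Out = scale \cdot X

    Args:
        input (Variable): The input tensor of this layer.
        scale (float, default 1.0): The scaling factor.
        name (str, default None): The name of this layer.

    Returns:
        A tensor variable storing the scaled result.

    Examples:
        .. code-block:: python

            out = my_scale(input=data, scale=2.0)
    """
    return input * scale
```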
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index e1edf079fa0f85eb7f6709fd945fffae88625d01..f627437f354a12c79cad25c959409db29ecbd874 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -1,2 +1,13 @@
开发标准
------------
+
+.. toctree::
+ :maxdepth: 1
+
+ new_op_cn.md
+ new_op_kernel.md
+ use_eigen_cn.md
+ name_convention.md
+ support_new_device.md
+ releasing_process.md
+ op_markdown_format.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index faf9dfcd315fddc4774c3717b41086fa6c6bf85a..0b65fed67ad45eb399b624184485a99a082d79e9 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -1,4 +1,13 @@
Development
------------
-This is Development page
+.. toctree::
+ :maxdepth: 1
+
+ new_op_en.md
+ new_op_kernel.md
+ use_eigen_en.md
+ name_convention.md
+ support_new_device.md
+ releasing_process.md
+ op_markdown_format.md
diff --git a/doc/fluid/dev/name_convention.md b/doc/fluid/dev/name_convention.md
index a02b356f058da68442516c2705d0bac140f8ef18..75830ef28c67dc4694d899efe503084b7b5852e1 100644
--- a/doc/fluid/dev/name_convention.md
+++ b/doc/fluid/dev/name_convention.md
@@ -1,8 +1,8 @@
-## Operator's Parameter Name Convention
+# Operator's Parameter Name Convention
To make the operator document itself more clear, we recommend operator names obey the following conventions.
-### OpProtoMaker names
+## OpProtoMaker names
When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc) needs to be defined. All the Inputs/Outputs and Attributes will be written into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61), and will be used in the client language to create the operator.
@@ -20,7 +20,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith
- Order.
- Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice.
-### Best Practice
+## Best Practice
Here we give some examples to show how these rules will be used.
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
index 92996585674b46f45549b972b9f295503b1c7f8c..0c3f88d9c31e05bec399c64bf6ade56e62e01f68 100644
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -26,13 +26,32 @@
依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下:
-
- 内容 | 定义位置
--------------- | :----------------------
-OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake
-Op定义 | `.cc`文件
-Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。
-注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中
+
+<table>
+<thead>
+<tr><th>内容</th><th>定义位置</th></tr>
+</thead>
+<tbody>
+<tr><td>OpProtoMake定义</td><td><code>.cc</code>文件,Backward Op不需要定义OpProtoMake</td></tr>
+<tr><td>Op定义</td><td><code>.cc</code>文件</td></tr>
+<tr><td>Kernel实现</td><td>CPU、CUDA共享Kernel实现在<code>.h</code>文件中,否则,CPU 实现在<code>.cc</code>文件中,CUDA 实现在<code>.cu</code>文件中。</td></tr>
+<tr><td>注册Op</td><td>Op注册实现在<code>.cc</code>文件;Kernel注册CPU实现在<code>.cc</code>文件中,CUDA实现在<code>.cu</code>文件中</td></tr>
+</tbody>
+</table>
+
实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md
index da8b1bdd1082e439456daf25e9b3a1e8eb534375..a566a09131f86251b70d5435d0a483aa2a705b35 100644
--- a/doc/fluid/dev/new_op_en.md
+++ b/doc/fluid/dev/new_op_en.md
@@ -33,6 +33,33 @@ Op definition | `.cc` files
Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
+
+<table>
+<thead>
+<tr><th>Information</th><th>Where is it defined</th></tr>
+</thead>
+<tbody>
+<tr><td>OpProtoMake definition</td><td><code>.cc</code> files; Backward Op does not need an OpProtoMake interface.</td></tr>
+<tr><td>Op definition</td><td><code>.cc</code> files</td></tr>
+<tr><td>Kernel implementation</td><td>The kernel methods shared between CPU and CUDA are defined in <code>.h</code> files. CPU-specific kernels live in <code>.cc</code> files, while CUDA-specific kernels are implemented in <code>.cu</code> files.</td></tr>
+<tr><td>Registering the Op</td><td>Ops are registered in <code>.cc</code> files; for Kernel registration, <code>.cc</code> files contain the CPU implementation, while <code>.cu</code> files contain the CUDA implementation.</td></tr>
+</tbody>
+</table>
+
New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable). **The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
@@ -279,7 +306,7 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass
def test_check_output(self):
self.check_output()
-
+
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
diff --git a/doc/fluid/dev/new_op_kernel_en.md b/doc/fluid/dev/new_op_kernel.md
similarity index 88%
rename from doc/fluid/dev/new_op_kernel_en.md
rename to doc/fluid/dev/new_op_kernel.md
index 123df0a7ee4943c0b789ef9cfa6e0804d0fdd564..55dea8d0a39232ede59d4663d6e1a47fbfc60853 100644
--- a/doc/fluid/dev/new_op_kernel_en.md
+++ b/doc/fluid/dev/new_op_kernel.md
@@ -1,14 +1,14 @@
-## Add Kernels for a New Device
+# Add Kernels for a New Device
-### Background
+## Background
PaddlePaddle Fluid has hundreds of operators. Each operator could have one or more kernels. A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
-### Write Kernels for A New Device
+## Write Kernels for A New Device
-#### Add A New Device
+### Add A New Device
For some historical reasons, we misuse the word *library* for *device*. For example, we call the device type the *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24). We will correct this ASAP.
@@ -23,7 +23,7 @@ enum class LibraryType {
```
-#### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
+### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`:
@@ -45,7 +45,7 @@ struct CUDAPlace {
typedef boost::variant<CUDAPlace, CPUPlace> Place;
```
-#### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37))
+### Add [device context](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)
After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
```cpp
@@ -58,7 +58,7 @@ class DeviceContext {
};
```
-#### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
+### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
Detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md).
@@ -85,7 +85,7 @@ class OpKernel : public OpKernelBase {
```
-#### Register the OpKernel to framework
+### Register the OpKernel to framework
After writing the components described above, we should register the kernel to the framework.
@@ -107,7 +107,7 @@ take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/oper
REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
                   paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
                   paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
-
+
REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<float>,
                   paddle::operators::CUDNNConvOpKernel<double>);
diff --git a/doc/fluid/dev/op_markdown_format.md b/doc/fluid/dev/op_markdown_format.md
index 0ee804d592252c727622cbe59b0644813db3c4fd..4e539d7992e5f67ee7b07193b59b6b425b73c9e5 100644
--- a/doc/fluid/dev/op_markdown_format.md
+++ b/doc/fluid/dev/op_markdown_format.md
@@ -15,26 +15,26 @@ The signature of the operator.
Each section mentioned above has been covered in further detail in the rest of the document.
-# PaddlePaddle Operator Name
+## PaddlePaddle Operator Name
This should be in all lowercase letters; in case of multiple words, we separate them with an underscore. For example:
`array to lod tensor` should be written as `array_to_lod_tensor`.
This naming convention should be standard across all PaddlePaddle operators.
-# Standard Operator Name
+## Standard Operator Name
This is the standard name of the operator as used in the community. The general standard is usually:
- Standard abbreviations like `SGD` are written in all capital letters.
- Operator names that contain multiple words inside a single name use `camelCase` (capitalize word boundaries inside the name).
- Keep numbers inside a word as is, with no boundary delimiters.
- Follow the name of the operator with the keyword: `Activation Operator.`
-# Operator description
+## Operator description
This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
-# LaTeX equation
+## LaTeX equation
This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same name should be separated by an underscore (`_`).
-# The signature
+## The signature
This section describes the signature of the operator: a list of Inputs and Outputs, each of which has a small description of what the variable represents and its type. The variable names follow the `CamelCase` naming convention. The proposed format for this is:
`Section :
VariableName : (VariableType) VariableDescription
diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process.md
index b9787261092f1f27377886152cb1596d9ff54188..0810765b85f73d9dba876e66fb43bb1ad476d6d2 100644
--- a/doc/fluid/dev/releasing_process.md
+++ b/doc/fluid/dev/releasing_process.md
@@ -66,7 +66,7 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
* 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
* 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。
* 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Request`,进而进行代码评审。
- * 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。
+ * 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。
* BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。
@@ -78,13 +78,116 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
-| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| API.V2 + Docker + GPU | | | | | | | | |
-| API.V2 + Docker + CPU | | | | | | | | |
-| `paddle_trainer` + Docker + GPU | | | | | | | | |
-| `paddle_trainer` + Docker + CPU | | | | | | | | |
-| API.V2 + Ubuntu + GPU | | | | | | | | |
-| API.V2 + Ubuntu + CPU | | | | | | | | |
-| `paddle_trainer` + Ubuntu + GPU | | | | | | | | |
-| `paddle_trainer` + Ubuntu + CPU | | | | | | | | |
+
+<table>
+<thead>
+<tr><th></th><th>新手入门章节</th><th>识别数字</th><th>图像分类</th><th>词向量</th><th>情感分析</th><th>语意角色标注</th><th>机器翻译</th><th>个性化推荐</th></tr>
+</thead>
+<tbody>
+<tr><td>API.V2 + Docker + GPU</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><td>API.V2 + Docker + CPU</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><td><code>paddle_trainer</code> + Docker + GPU</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><td><code>paddle_trainer</code> + Docker + CPU</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><td>API.V2 + Ubuntu + GPU</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><td>API.V2 + Ubuntu + CPU</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><td><code>paddle_trainer</code> + Ubuntu + GPU</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><td><code>paddle_trainer</code> + Ubuntu + CPU</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+</table>
diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md
index f36843b4408c21bdca1fa83853e5b0a40116791c..75922e7d85a13e53ce94619a48d8da8b960e6c9a 100644
--- a/doc/fluid/dev/use_eigen_cn.md
+++ b/doc/fluid/dev/use_eigen_cn.md
@@ -1,16 +1,16 @@
-## 在Paddle中如何使用Eigen
+# 在Paddle中如何使用Eigen
神经网络本质上是一个计算图,计算需要的数据存放在`Tensor`中,而计算过程是由`Operator`来描述的。在执行时,`Operator`调用对应`OpKernel`中的`Compute`接口,实现对`Tensor`的操作。
-### Eigen Tensor模块
+## Eigen Tensor模块
Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。
关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) 和[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)
-### paddle::framework::Tensor
+## paddle::framework::Tensor
Paddle Tensor定义在framework目录下,其主要接口如下:
@@ -20,14 +20,14 @@ class Tensor {
/*! Return a pointer to mutable memory block. */
  template <typename T>
inline T* data();
-
+
/**
* @brief Return a pointer to mutable memory block.
* @note If not exist, then allocation.
*/
  template <typename T>
inline T* mutable_data(platform::Place place);
-
+
/**
* @brief Return a pointer to mutable memory block.
*
@@ -38,17 +38,17 @@ class Tensor {
*/
  template <typename T>
inline T* mutable_data(DDim dims, platform::Place place);
-
+
/*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims);
-
+
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
private:
/*! holds the memory block if allocated. */
  std::shared_ptr<Placeholder> holder_;
-
+
/*! points to dimensions of memory block. */
DDim dim_;
};
@@ -129,7 +129,7 @@ From是EigenTensor模板提供的一个接口,可以实现从paddle::framework
-### 实现计算
+## 实现计算
当需要完成计算时,我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是,这里的EigenTensor之间的运算只是改变了原有Tensor中的数据,而不会改变原有Tensor的shape信息。
diff --git a/doc/fluid/dev/use_eigen_en.md b/doc/fluid/dev/use_eigen_en.md
index 3a466f73d1f9b94a29b171015279c782ca50bd02..3313d097cb21e40c23aa13187b6a50562f12403a 100644
--- a/doc/fluid/dev/use_eigen_en.md
+++ b/doc/fluid/dev/use_eigen_en.md
@@ -1,9 +1,9 @@
-## How to use Eigen in Paddle
+# How to use Eigen in Paddle
Essentially, a neural network is a compute graph. The data needed for the computation is stored in `Tensor`s, and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`.
-### Eigen Tensor Module
+## Eigen Tensor Module
The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU.
@@ -12,7 +12,7 @@ Note that Eigen Tensor is still being actively developed, so its tests are not c
For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
-### paddle::framework::Tensor
+## paddle::framework::Tensor
Paddle's Tensor is defined in the framework directory with the following interface:
@@ -105,7 +105,7 @@ void Compute(const framework::ExecutionContext& context) const override {
```
-### paddle::framework::Tensor到EigenTensor的转换
+## Conversion from paddle::framework::Tensor to EigenTensor
As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor` to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
@@ -129,7 +129,7 @@ For more transformations, see the [unit tests](https://github.com/PaddlePaddle/P
-### Implementing Computation
+## Implementing Computation
While computing, the device interface is needed from the EigenTensors on the left-hand side of the assignments. Note that the computation between EigenTensors only changes the data originally in the Tensor and does not change the shape information associated with the Tensor.
diff --git a/doc/fluid/getstarted/concepts/index_cn.rst b/doc/fluid/getstarted/concepts/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2e7f70fc4cb871a80ffaffec6c06797973cd2f85
--- /dev/null
+++ b/doc/fluid/getstarted/concepts/index_cn.rst
@@ -0,0 +1,4 @@
+基本使用概念
+============
+
+TBD
diff --git a/doc/fluid/getstarted/concepts/index_en.rst b/doc/fluid/getstarted/concepts/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..78cca1e2a3443c2949ca0655190b0f05502f519a
--- /dev/null
+++ b/doc/fluid/getstarted/concepts/index_en.rst
@@ -0,0 +1,4 @@
+Concepts
+============
+
+TBD
diff --git a/doc/fluid/getstarted/concepts/save_model/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md
index e29129fddf775939c9f7a8b49d850d523e6e5a45..1f12ba0497369eacc6a2db7984781b5672f45ea1 100644
--- a/doc/fluid/getstarted/concepts/save_model/model_format.md
+++ b/doc/fluid/getstarted/concepts/save_model/model_format.md
@@ -4,30 +4,70 @@
A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code.
-As a result, In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters.
+As a result, in PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large-size parameters and the efficient serialization/deserialization of parameters.
## Implementation
-The topology is saved as a plain text in a detailed self-contain protobuf file.
+The topology is saved as plain text in a detailed, self-contained protobuf file.
The parameters are saved as a binary file. As we all know, the protobuf message has a size limit of [64M](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for this task.
-As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is,
+As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), described by a [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99) proto. We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims` and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a contiguous memory buffer. For speed, we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is as follows.
The table below shows a tensor's byte view in detail. Note that all the integer values are written in the little-endian format.
-|field name | type | description |
-| --- | --- | --- |
-| version | uint32_t | Version of saved file. Always 0 now. |
-| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. |
-| tensor desc | void* | TensorDesc protobuf binary message |
-| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
-| lod_level | uint64_t | Level of LoD |
-| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
-| data of lod[0] | uint64_t* | [Optional] lod[0].data() |
-| ... | ... | ... |
-
+
+<table>
+<thead>
+<tr><th>field name</th><th>type</th><th>description</th></tr>
+</thead>
+<tbody>
+<tr><td>version</td><td>uint32_t</td><td>Version of saved file. Always 0 now.</td></tr>
+<tr><td>tensor desc length</td><td>uint32_t</td><td>TensorDesc (protobuf message) length in bytes.</td></tr>
+<tr><td>tensor desc</td><td>void*</td><td>TensorDesc protobuf binary message</td></tr>
+<tr><td>tensor data</td><td>void*</td><td>Tensor's data in binary format. The length of <code>tensor_data</code> is decided by <code>TensorDesc.dims()</code> and <code>TensorDesc.data_type()</code></td></tr>
+<tr><td>lod_level</td><td>uint64_t</td><td>Level of LoD</td></tr>
+<tr><td>length of lod[0]</td><td>uint64_t</td><td>[Optional] length of lod[0] in bytes.</td></tr>
+<tr><td>data of lod[0]</td><td>uint64_t*</td><td>[Optional] lod[0].data()</td></tr>
+<tr><td>...</td><td>...</td><td>...</td></tr>
+</tbody>
+</table>
+
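A short Python sketch of writing this layout (assuming the serialized `TensorDesc` bytes are already at hand; the helper name and sample values are illustrative):

```python
import struct

def save_tensor(f, desc_bytes, data_bytes, lod=()):
    f.write(struct.pack("<I", 0))                # version, always 0 now
    f.write(struct.pack("<I", len(desc_bytes)))  # tensor desc length
    f.write(desc_bytes)                          # TensorDesc proto message
    f.write(data_bytes)                          # raw tensor memory
    f.write(struct.pack("<Q", len(lod)))         # lod_level
    for level in lod:                            # optional LoD levels
        f.write(struct.pack("<Q", len(level) * 8))         # length in bytes
        f.write(struct.pack("<%dQ" % len(level), *level))  # lod[i].data()

with open("param.bin", "wb") as f:
    save_tensor(f, b"\x08\x01", b"\x00" * 16, lod=[(0, 2, 4)])
```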
## Summary
diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst
index c4d8525f23ee18cb7f41ab2f0d148fc1dcc852b2..75af7354be93a6eeabfa9ccf86903505402a7ca6 100644
--- a/doc/fluid/getstarted/index_cn.rst
+++ b/doc/fluid/getstarted/index_cn.rst
@@ -1,4 +1,19 @@
新手入门
-------------
+============
-新手入门
+
+如果需要快速了解PaddlePaddle的使用,可以参考以下指南。
+
+.. toctree::
+ :maxdepth: 1
+
+ quickstart_cn.rst
+
+
+在使用PaddlePaddle构建应用时,需要了解一些基本概念。
+这里以一个线性回归为例子,详细介绍了PaddlePaddle的使用流程,包括数据格式,模型配置与训练等。
+
+.. toctree::
+ :maxdepth: 1
+
+ concepts/use_concepts_cn.rst
diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst
index a4efd05e2fd94ac0e2cbbc8603e6b0261b7e787f..75a43f4af87c34830ec940068196e6ca72640501 100644
--- a/doc/fluid/getstarted/index_en.rst
+++ b/doc/fluid/getstarted/index_en.rst
@@ -1,4 +1,18 @@
GET STARTED
-------------
+============
-This is get started page
+If you want to quickly know how to use PaddlePaddle, please refer to the following guide:
+
+.. toctree::
+ :maxdepth: 1
+
+ quickstart_en.rst
+
+While using PaddlePaddle to build applications, please understand some basic concepts.
+
+Here is an example of linear regression. It introduces the workflow of PaddlePaddle, including data format, model configuration, and training.
+
+.. toctree::
+ :maxdepth: 1
+
+ concepts/index_en.rst
diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..93a9e4e37a8495c553cec257c27363ca8d062d39
--- /dev/null
+++ b/doc/fluid/getstarted/quickstart_cn.rst
@@ -0,0 +1 @@
+../../v2/getstarted/quickstart_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..6e1894faa1176bb9e77f616e07df36191e54b782
--- /dev/null
+++ b/doc/fluid/getstarted/quickstart_en.rst
@@ -0,0 +1 @@
+../../v2/getstarted/quickstart_en.rst
\ No newline at end of file
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
index 1b6f767869aaa800c122c8e7a06a1413e48e10e0..b99b90056b0a2e51f2668a6d27d94857bdc09c37 100644
--- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -65,10 +65,10 @@ exit(1)
**因此,在分布式的Fluid环境中,我们有两个角色需要创建,分别是Parameter Server和Trainer。**
-### 分布式训练
+### 分布式训练
Fluid专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数,将它们分隔为两部分,通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
```python
-optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
```
将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下:
```python
@@ -99,15 +99,51 @@ for pass_id in range(100):
### 分布式训练脚本运行说明
分布式任务的运行需要将表格中说明的多个参数进行赋值:
-| 参数名 | 值类型 | 说明 | 示例 |
-|:-------------|:------|:---------------------------------------|:-------------|
-| trainer_id | int | 当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值 | 0/1/2/3 |
-| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 |
-| trainers | int | 训练节点的总个数,>0的数字 | 4 |
-| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 |
-| training_role | str | 节点角色, TRAINER/PSERVER | PSERVER |
-
-**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下:
+<table>
+<thead>
+<tr>
+<th>参数名</th>
+<th>值类型</th>
+<th>说明</th>
+<th>示例</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>trainer_id</td>
+<td>int</td>
+<td>当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值</td>
+<td>0/1/2/3</td>
+</tr>
+<tr>
+<td>pservers</td>
+<td>str</td>
+<td>parameter server 列表</td>
+<td>127.0.0.1:6710,127.0.0.1:6711</td>
+</tr>
+<tr>
+<td>trainers</td>
+<td>int</td>
+<td>训练节点的总个数,>0的数字</td>
+<td>4</td>
+</tr>
+<tr>
+<td>server_endpoint</td>
+<td>str</td>
+<td>当前所起的服务节点的IP:PORT</td>
+<td>127.0.0.1:8789</td>
+</tr>
+<tr>
+<td>training_role</td>
+<td>str</td>
+<td>节点角色, TRAINER/PSERVER</td>
+<td>PSERVER</td>
+</tr>
+</tbody>
+</table>
+
+**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下:
```python
t = fluid.DistributeTranspiler()
diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst
index a92abad0c56a4fd821f9a6b9f4f5909504c8aaf1..97aeaf167d329529f2b120b5a3d4085e0510fe16 100644
--- a/doc/fluid/howto/index_cn.rst
+++ b/doc/fluid/howto/index_cn.rst
@@ -1,2 +1,7 @@
进阶使用
------------
+
+.. toctree::
+ :maxdepth: 1
+
+ optimization/index_cn.rst
diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst
index 06036bdce554a96443ea1fa47c15f7670ea6089d..fd21e167ce3a46da167db1e9d7013804f730e047 100644
--- a/doc/fluid/howto/index_en.rst
+++ b/doc/fluid/howto/index_en.rst
@@ -1,4 +1,7 @@
HOW TO
------------
-This is how to page
+.. toctree::
+ :maxdepth: 1
+
+ optimization/index_en.rst
diff --git a/doc/fluid/howto/optimization/benchmark/README.md b/doc/fluid/howto/optimization/benchmark/README.md
new file mode 120000
index 0000000000000000000000000000000000000000..db30af7f53231c687f9ad61ad961a685733cbad0
--- /dev/null
+++ b/doc/fluid/howto/optimization/benchmark/README.md
@@ -0,0 +1 @@
+../../../../../benchmark/cluster/README.md
\ No newline at end of file
diff --git a/doc/fluid/howto/optimization/benchmark/index_cn.rst b/doc/fluid/howto/optimization/benchmark/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9404800eb86ca6d27886258b67393028c76954dc
--- /dev/null
+++ b/doc/fluid/howto/optimization/benchmark/index_cn.rst
@@ -0,0 +1,8 @@
+基准
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ vgg16/README.md
+ README.md
diff --git a/doc/fluid/howto/optimization/benchmark/index_en.rst b/doc/fluid/howto/optimization/benchmark/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1e200b660cc7f6aeaf8b3d94fd7a14999a52bccd
--- /dev/null
+++ b/doc/fluid/howto/optimization/benchmark/index_en.rst
@@ -0,0 +1,8 @@
+Benchmark
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ vgg16/README.md
+ README.md
diff --git a/doc/fluid/howto/optimization/benchmark/vgg16/README.md b/doc/fluid/howto/optimization/benchmark/vgg16/README.md
new file mode 120000
index 0000000000000000000000000000000000000000..ca963ef5f06aa0c2fe507ba7548dca8017358120
--- /dev/null
+++ b/doc/fluid/howto/optimization/benchmark/vgg16/README.md
@@ -0,0 +1 @@
+../../../../../../benchmark/cluster/vgg16/README.md
\ No newline at end of file
diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md
index d59be670c2b33b64d9b6f96b53f50e5bf9f0613b..8266dec3c6125a09b90ac0ccd4aa5464f5c7db31 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
@@ -8,7 +8,7 @@ PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大
* Python 与 C++ 混合代码的性能分析
-## Python代码的性能分析
+# Python代码的性能分析
### 生成性能分析文件
@@ -42,14 +42,40 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
每一列的含义是:
-| 列名 | 含义 |
-| --- | --- |
-| ncalls | 函数的调用次数 |
-| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
-| percall | tottime的每次调用平均时间 |
-| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
-| percall | cumtime的每次调用平均时间 |
-| filename:lineno(function) | 文件名, 行号,函数名 |
+<table>
+<thead>
+<tr>
+<th>列名</th>
+<th>含义</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>ncalls</td>
+<td>函数的调用次数</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td>函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td>percall</td>
+<td>tottime的每次调用平均时间</td>
+</tr>
+<tr>
+<td>cumtime</td>
+<td>函数总时间。包含这个函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td>percall</td>
+<td>cumtime的每次调用平均时间</td>
+</tr>
+<tr>
+<td>filename:lineno(function)</td>
+<td>文件名, 行号,函数名</td>
+</tr>
+</tbody>
+</table>
+
### 寻找性能瓶颈
diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md
index 01e5fddf61547f9fc86ef18a6f2e2ac508d22dbb..e95556dd608b7ff0a3eb18873df0015a2da94e7c 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -14,7 +14,7 @@ the profiling and tuning of
1. the Python code and
1. the mixture of Python and C++ code.
-## Profiling the Python Code
+# Profiling the Python Code
### Generate the Performance Profiling File
@@ -57,14 +57,40 @@ port, we will see the output like the following:
where each line corresponds to Python function, and the meaning of
each column is as follows:
-| column | meaning |
-| --- | --- |
-| ncalls | the number of calls into a function |
-| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
-| percall | tottime divided by ncalls |
-| cumtime | the total execution time of the function, including the execution time of other functions being called |
-| percall | cumtime divided by ncalls |
-| filename:lineno(function) | where the function is defined |
+<table>
+<thead>
+<tr>
+<th>column</th>
+<th>meaning</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>ncalls</td>
+<td>the number of calls into a function</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td>the total execution time of the function, not including the execution time of other functions called by the function</td>
+</tr>
+<tr>
+<td>percall</td>
+<td>tottime divided by ncalls</td>
+</tr>
+<tr>
+<td>cumtime</td>
+<td>the total execution time of the function, including the execution time of other functions being called</td>
+</tr>
+<tr>
+<td>percall</td>
+<td>cumtime divided by ncalls</td>
+</tr>
+<tr>
+<td>filename:lineno(function)</td>
+<td>where the function is defined</td>
+</tr>
+</tbody>
+</table>
+
### Identify Performance Bottlenecks
@@ -81,7 +107,7 @@ focus on. We can sort above profiling file by tottime:
We can see that the most time-consuming function is the `built-in
method run`, which is a C++ function in `libpaddle.so`. We will
-explain how to profile C++ code in the next section. At this
+explain how to profile C++ code in the next section. At this
moment, let's look into the third function `sync_with_cpp`, which is a
Python function. We can click it to understand more about it:
diff --git a/doc/fluid/howto/optimization/index_cn.rst b/doc/fluid/howto/optimization/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..27cc96702356703b339db845dc81913bdcc9f23b
--- /dev/null
+++ b/doc/fluid/howto/optimization/index_cn.rst
@@ -0,0 +1,9 @@
+性能优化
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ timeline.md
+ cpu_profiling_cn.md
+ benchmark/index_cn.rst
diff --git a/doc/fluid/howto/optimization/index_en.rst b/doc/fluid/howto/optimization/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4ce624fe8f108a6afc7cd08a1542332755d22e04
--- /dev/null
+++ b/doc/fluid/howto/optimization/index_en.rst
@@ -0,0 +1,9 @@
+Performance Optimization
+---------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ timeline.md
+ cpu_profiling_en.md
+ benchmark/index_en.rst
diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline.md
index 9d9565a3e698a83ca465c5da83ff892360c33b8f..96481ae2a6e4442d40803f8d5361e5f942502df3 100644
--- a/doc/fluid/howto/optimization/timeline.md
+++ b/doc/fluid/howto/optimization/timeline.md
@@ -1,4 +1,4 @@
-## how to use timeline tool to do profile
+# How to use the timeline tool to profile
1. Add `with profiler.profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.
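+
+A minimal sketch of this step (the profiler arguments and the names `exe`, `feeder`, and `train_reader` are illustrative; the exact signature may differ):
+
+```python
+import paddle.fluid.profiler as profiler
+
+with profiler.profiler('All', 'total', '/tmp/profile'):
+    for batch_id, data in enumerate(train_reader()):
+        exe.run(feed=feeder.feed(data))
+        if batch_id > 10:  # keep the number of profiled batches small
+            break
+```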
diff --git a/doc/fluid/index_cn.rst b/doc/fluid/index_cn.rst
index be3bed4393a7346d4f2a53e2c7409ee7165fb5b6..d878d192cae7ee9e8b8fdb4f615839c186fdf334 100644
--- a/doc/fluid/index_cn.rst
+++ b/doc/fluid/index_cn.rst
@@ -5,8 +5,8 @@
:maxdepth: 1
getstarted/index_cn.rst
- design/index_cn.rst
build_and_install/index_cn.rst
+ design/index_cn.rst
howto/index_cn.rst
dev/index_cn.rst
faq/index_cn.rst
diff --git a/doc/fluid/index_en.rst b/doc/fluid/index_en.rst
index 87c831420a57b4b9ce77ecf44f7f4d0feec833a6..2bc76b58982cf50e637d15cca0c5d78166aa73a9 100644
--- a/doc/fluid/index_en.rst
+++ b/doc/fluid/index_en.rst
@@ -5,8 +5,8 @@
:maxdepth: 1
getstarted/index_en.rst
- design/index_en.rst
build_and_install/index_en.rst
+ design/index_en.rst
howto/index_en.rst
dev/index_en.rst
faq/index_en.rst
diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt
index 286fe8845cd7a909d4030540e72362864b536063..82de7a3a3e1ca7724e1eda877d53454a4fa4129a 100644
--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
@@ -20,13 +20,15 @@ configure_file(
"${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY)
-sphinx_add_target(paddle_docs
+sphinx_add_target(paddle_v2_docs
html
${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
+add_dependencies(paddle_v2_docs gen_proto_py)
+
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -41,11 +43,13 @@ configure_file(
"${BINARY_BUILD_DIR_CN}/conf.py"
@ONLY)
-sphinx_add_target(paddle_docs_cn
+sphinx_add_target(paddle_v2_docs_cn
html
${BINARY_BUILD_DIR_CN}
${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
+add_dependencies(paddle_v2_docs_cn gen_proto_py)
+
add_subdirectory(api)
diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt
index 2ad589e8a260e48d46cba2300d6e2bcd4bdd8019..da1eafc02ed8cd155d4f0f1fbadcb7b237b6fcc1 100644
--- a/doc/v2/api/CMakeLists.txt
+++ b/doc/v2/api/CMakeLists.txt
@@ -12,9 +12,11 @@ configure_file(
"${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY)
-sphinx_add_target(paddle_api_docs
+sphinx_add_target(paddle_v2_apis
html
${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind)
diff --git a/doc/fluid/design/interface/00.why_plain_c.md b/doc/v2/design/interface/00.why_plain_c.md
similarity index 100%
rename from doc/fluid/design/interface/00.why_plain_c.md
rename to doc/v2/design/interface/00.why_plain_c.md
diff --git a/doc/fluid/design/interface/01.inference_implementation.md b/doc/v2/design/interface/01.inference_implementation.md
similarity index 100%
rename from doc/fluid/design/interface/01.inference_implementation.md
rename to doc/v2/design/interface/01.inference_implementation.md
diff --git a/doc/v2/design/interface/index_cn.rst b/doc/v2/design/interface/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2509a5c5f4182d8ce3a16a3b7bd92c0d7bf5b056
--- /dev/null
+++ b/doc/v2/design/interface/index_cn.rst
@@ -0,0 +1,7 @@
+多语言接口
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ 00.why_plain_c.md
diff --git a/doc/v2/design/interface/index_en.rst b/doc/v2/design/interface/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..356e58c39c5ef6ee5ee50ab999b85f88628bfb85
--- /dev/null
+++ b/doc/v2/design/interface/index_en.rst
@@ -0,0 +1,7 @@
+Multilingual Interface
+-----------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ 00.why_plain_c.md
diff --git a/doc/v2/design/mkl/mkldnn.md b/doc/v2/design/mkl/mkldnn.md
index e2fe1e6b26ffa73fda81863abfadf697c0acbfcf..1bd2e7bc34ee79eb753b3520d97e5e7beca89b0b 100644
--- a/doc/v2/design/mkl/mkldnn.md
+++ b/doc/v2/design/mkl/mkldnn.md
@@ -44,7 +44,7 @@ MKL,MKLML以及MKL-DNN三者关系如下表:
| Name | Open Source | License | Descriptions |
| :---------- | :--------------- | :---------- | :------------ |
-| MKL | No | Proprietary | Accelerate math processing routines |
+| MKL | No | Proprietary | Accelerate math processing routines |
| MKLML | No | Proprietary | Small package of MKL, especially for Machine Learning |
| MKL-DNN | Yes | Apache 2.0 | Accelerate primitives processing routines especially for Deep Neural Networks |
@@ -89,7 +89,7 @@ PaddlePaddle/Paddle
### CMake
在`CMakeLists.txt`中提供一个与MKL有关的总开关:`WITH_MKL`,它负责决定编译时是否使用MKLML和MKL-DNN
-- `WITH_MKLML` 控制是否使用MKLML库。
+- `WITH_MKLML` 控制是否使用MKLML库。
当打开`WITH_MKL`时,会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。
编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。
MKLML的库目前都是动态库,主要包括`libiomp5.so`和`libmklml_intel.so`。
@@ -172,7 +172,7 @@ if use_mkldnn
self.layer_type = mkldnn_*
```
-所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。
+所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。
同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。
diff --git a/doc/v2/dev/index_en.rst b/doc/v2/dev/index_en.rst
index 549f5fa9aace7eb699d229e5f61fe10ae4ed4d66..36516b7953224e799e1065fd7930509eec0aa650 100644
--- a/doc/v2/dev/index_en.rst
+++ b/doc/v2/dev/index_en.rst
@@ -1,9 +1,27 @@
Development
------------
+
+PaddlePaddle adheres to the code and documentation specifications described in the following three sections.
+
+
+PaddlePaddle uses Git for version control and Docker for its build and test environment. The code base includes CUDA, C++, Python, Shell, and other languages; the code complies with the Google C++ Style Guide and PEP 8, and style is checked automatically by an inspection tool. Code comments must follow the Doxygen specification, and code that does not meet the style requirements will fail to compile. We provide the following guidelines for using Git, running build tests, and developing code.
.. toctree::
:maxdepth: 1
contribute_to_paddle_en.md
+
+
+PaddlePaddle is well documented in both English and Chinese. We recommend using the English version of the documents and problem descriptions. The design documents focus on the problem description and background, followed by the solution. Since the documents are generated by Sphinx, code comments should comply with the Sphinx documentation standard. We recommend using the paddlepaddle.org tool to build and preview documents locally. Please refer to:
+
+.. toctree::
+ :maxdepth: 1
+
write_docs_en.rst
+
+PaddlePaddle V2 defines new operations by adding new Layers. You can implement various complex layers by combining basic APIs, which satisfies most applications. If you want to customize a layer, please refer to the following guide; patches are welcome.
+
+.. toctree::
+ :maxdepth: 1
+
new_layer_en.rst
diff --git a/doc/v2/faq/build_and_install/index_cn.rst b/doc/v2/faq/build_and_install/index_cn.rst
index 7c7e896d187e4fe1544d7ec933fa4fa9f24df3cd..f292684fb5fe2df06db5239e7f43fdfa1dd2f2bd 100644
--- a/doc/v2/faq/build_and_install/index_cn.rst
+++ b/doc/v2/faq/build_and_install/index_cn.rst
@@ -139,3 +139,77 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二
touch ../extern_mklml-stamp/extern_mklml-download
// 4. 接着编译即可
+
+9. 在Mac上无法安装numpy等Python包,权限错误
+------------------
+
+Mac上对自带的Python和包有严格的权限保护,最好不要在自带的Python上安装。建议用virtualenv建立一个新的Python环境来操作。
+
+virtualenv的基本原理是将机器上的Python运行所需的运行环境完整地拷贝一份。我们可以在一台机器上制造多份拷贝,并在这多个拷贝之间自由切换,这样就相当于在一台机器上拥有了多个相互隔离、互不干扰的Python环境。
+
+下面简单介绍下如何用virtualenv为Paddle生成一个专用的Python环境:
+
+安装virtualenv:
+::::::::::::::::
+
+virtualenv本身也是Python的一个包,可以用pip进行安装:
+
+.. code-block:: bash
+
+ sudo -H pip install virtualenv
+
+由于virtualenv需要安装给系统自带的Python,因此需要使用sudo权限。
+
+创建一个新的Python运行环境:
+:::::::::::::::::::
+
+.. code-block:: bash
+
+ virtualenv --no-site-packages paddle
+
+--no-site-packages 参数表示不拷贝已有的任何第三方包,创造一个完全干净的新Python环境。后面的paddle是我们为这个新创建的环境取的名字。
+
+执行完这一步后,当前目录下应该会出现一个名为paddle(或者你取的其他名字)的目录。这个目录里保存了运行一个Python环境所需要的各种文件。
+
+启动运行环境:
+::::::::::::::::
+
+.. code-block:: bash
+
+ source paddle/bin/activate
+
+执行后会发现命令提示符前面增加了(paddle)字样,说明已经成功启动了名为‘paddle’的Python环境。执行which python,可以发现使用的已经是刚刚创建的paddle目录下的Python。
+
+在这个环境中,我们可以自由地进行Paddle的安装、使用和开发工作,无需担心对系统自带Python的影响。
+
+退出运行环境:
+:::::::::::::::
+
+直接执行:
+
+.. code-block:: bash
+
+ deactivate
+
+可以看到命令提示符前面的(paddle)字样消失。
+
+自动启动某一Python环境:
+::::::::::::::::
+
+如果我们经常使用Paddle,我们每次打开终端后都需要执行一下source paddle/bin/activate来启动环境,比较繁琐。为了简便,可以修改终端的配置文件,来让终端每次启动后自动启动特定的Python环境。
+
+执行:
+
+.. code-block:: bash
+
+ vi ~/.bash_profile
+
+打开终端配置文件,并在文件的最后添加一行:
+
+.. code-block:: bash
+
+ source paddle/bin/activate
+
+保存并关闭文件。
+
+这样,每次打开终端时就会自动启动名为‘paddle’的Python环境了。
diff --git a/doc/v2/faq/build_and_install/index_en.rst b/doc/v2/faq/build_and_install/index_en.rst
index 614db457d715665073cec1a495d4d7df6887532f..7488ed8137d57785f36b9f1e1ed1269f864960bc 100644
--- a/doc/v2/faq/build_and_install/index_en.rst
+++ b/doc/v2/faq/build_and_install/index_en.rst
@@ -1,5 +1,143 @@
-############################
-Install, Build and Unit test
-############################
+.. _install_faq:
-TBD
+###############################
+Compile, Install, and Unit Test
+###############################
+
+.. contents::
+
+1. Insufficient CUDA driver version
+----------------------------------------------------------------
+
+Many users face issues like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is usually that the local CUDA driver is not mapped into the container.
+You can solve the issue by running the following commands:
+
+.. code-block:: bash
+
+ $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+ $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+ $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+For more information about Docker's installation and usage, please refer to `PaddlePaddle Docker documentation `_ .
+
+
+2. Version mismatch between PythonLibs and PythonInterpreter
+----------------------------------------------------------------
+
+It is a common bug when CMake looks up Python. If you have installed multiple versions of Python, CMake may find a version mismatch between PythonLibs and PythonInterpreter. In that case you need to specify the Python version explicitly, as follows.
+
+ .. code-block:: bash
+
+    cmake .. -DPYTHON_EXECUTABLE=<python executable path> -DPYTHON_LIBRARY=<python library path> -DPYTHON_INCLUDE_DIR=<python include dir>
+
+Replace ``<python executable path>``, ``<python library path>``, and ``<python include dir>`` with your local paths.
+
+3. PaddlePaddle version is 0.0.0
+------------------------------------------------
+This issue can happen when you run the command `paddle version` or `cmake ..`:
+
+.. code-block:: bash
+
+ CMake Warning at cmake/version.cmake:20 (message):
+ Cannot add paddle version from git tag
+
+You should pull all remote branches to your local machine with the command :code:`git fetch upstream`, and then run :code:`cmake` again.
+
+4. paddlepaddle\*.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
+
+The primary cause of this issue is that pip cannot find a PaddlePaddle installation package that matches your current system. The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12 with Python 2.7 and pip 9.0.1.
+
+You can upgrade Pip with the following command\:
+
+.. code-block:: bash
+
+ pip install --upgrade pip
+
+If it does not work for you, you can run the command :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the suffix of Python package which your system may support and then compare it with the suffix of your installation.
+
+If the system supports :code:`linux_x86_64` and the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest version.
+
+If the system supports :code:`manylinux1_x86_64` and the local installation package is :code:`linux_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again.
+
+
+5. ImportError: No module named v2
+----------------------------------
+Please uninstall Paddle V1 if you have installed it before.
+
+.. code-block:: bash
+
+ pip uninstall py_paddle paddle
+
+Then install the PaddlePaddle Python packages: enter the build directory and run the following commands.
+
+.. code-block:: bash
+
+    pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
+
+6. Illegal instruction
+-----------------------
+This issue may be caused by using the wrong PaddlePaddle binary version, which uses AVX SIMD instructions to increase CPU performance. Please choose the correct version for your machine.
+
+7. Python unittest fails
+--------------------------------
+
+If the following python unittest testcases fail:
+
+.. code-block:: bash
+
+ 24 - test_PyDataProvider (Failed)
+ 26 - test_RecurrentGradientMachine (Failed)
+ 27 - test_NetworkCompare (Failed)
+ 28 - test_PyDataProvider2 (Failed)
+ 32 - test_Prediction (Failed)
+ 33 - test_Compare (Failed)
+ 34 - test_Trainer (Failed)
+ 35 - test_TrainerOnePass (Failed)
+ 36 - test_CompareTwoNets (Failed)
+ 37 - test_CompareTwoOpts (Failed)
+ 38 - test_CompareSparse (Failed)
+ 39 - test_recurrent_machine_generation (Failed)
+ 40 - test_PyDataProviderWrapper (Failed)
+ 41 - test_config_parser (Failed)
+ 42 - test_swig_api (Failed)
+ 43 - layers_test (Failed)
+
+Please check the PaddlePaddle unittest logs which may suggest the following:
+
+.. code-block:: bash
+
+ paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+ Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+The solution is:
+
+* Remove the old PaddlePaddle to make a clean environment for the unit tests. If the PaddlePaddle package is already in Python's site-packages, the unit tests would refer to the Python package in site-packages instead of the Python package in the :code:`/python` directory of the source tree. Setting :code:`PYTHONPATH` to :code:`/python` is also useless, because Python's search path gives priority to the installed Python package.
+
+
+8. Failed to download the MKLML library
+----------------------------------------------
+
+.. code-block:: bash
+
+ make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4
+ make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2
+ make[1]: *** waiting for the unfinished jobs....
+
+Cause: a slow network or an SSL connection issue causes the MKLML library download to fail.
+
+The solution is to download and install it manually. The specific steps are as follows.
+
+.. code-block:: bash
+
+ // 1. enter the directory
+ cd build/third_party/mklml/src/extern_mklml
+
+ // 2. check the size of the package, normally 75M, if less than 75M, the download fails
+ du -sh mklml_lnx_2018.0.1.20171007.tgz
+
+ // 3. manually download and unzip and make the download success tag:
+ wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz
+ tar zxf mklml_lnx_2018.0.1.20171007.tgz
+ touch ../extern_mklml-stamp/extern_mklml-download
+
+ // 4. then compile
+
diff --git a/doc/v2/faq/cluster/index_en.rst b/doc/v2/faq/cluster/index_en.rst
index 855b7e8e53307b82a72c156be4ef509e27edf822..fa942a09625bef78b28456beeb735272b686e061 100644
--- a/doc/v2/faq/cluster/index_en.rst
+++ b/doc/v2/faq/cluster/index_en.rst
@@ -2,4 +2,15 @@
Cluster Training and Prediction
###############################
-TBD
+.. contents::
+
+1. Network connection errors in the log during multi-node cluster training
+------------------------------------------------
+There may be errors in the log related to network connection problems during multi-node cluster training, for example, :code:`Connection reset by peer`.
+This kind of error is usually caused by the abnormal exit of a training process on some node, after which the other nodes can no longer connect to it. Steps to troubleshoot the problem are as follows:
+
+* Find the first error in :code:`train.log` and :code:`server.log`, and check whether another fault caused the problem, such as an FPE, or running out of memory or disk space.
+
+* If the first error in server.log says "Address already used", this may be caused by a port conflict due to non-exclusive execution. Contact the sys-admin to check whether the current MPI cluster supports jobs submitted with the parameter :code:`resource=full`. If the current MPI cluster does not support this parameter, change the server port and try again.
+
+* If the current MPI cluster does not support the exclusive pattern, which allows a process to occupy the whole node, ask the administrator to replace or update this cluster.
diff --git a/doc/v2/faq/model/index_en.rst b/doc/v2/faq/model/index_en.rst
index cb26f59655f97dc28a2047994643ae16b8857964..67a33e08e192e5627ac3b0abd76e979f21ed2079 100644
--- a/doc/v2/faq/model/index_en.rst
+++ b/doc/v2/faq/model/index_en.rst
@@ -2,4 +2,80 @@
Model Configuration
###################
-TBD
+.. contents::
+
+1. How to deal with error :code:`Duplicated layer name`
+----------------------------------------------------------
+
+The general reason for this error is that users may have set the same value for the attribute :code:`name` in different layers. Find the :code:`name` attributes with the same value in different layers and set them to different values.
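+
+A minimal illustration of the error (the layer names here are hypothetical):
+
+.. code-block:: python
+
+    a = paddle.layer.fc(input=x, size=128, name='fc')
+    b = paddle.layer.fc(input=y, size=128, name='fc')  # same name triggers the error; rename one of them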
+
+2. How to use :code:`paddle.layer.memory`'s attribute :code:`name`
+----------------------------------------------------------------------
+
+* :code:`paddle.layer.memory` is used to get the output of a layer's last timestep; that layer is specified by the attribute :code:`name`. Thus, :code:`paddle.layer.memory` associates itself with the layer whose attribute :code:`name` has the same value, and uses the output of that layer's last timestep as the input of its own current timestep.
+
+* All PaddlePaddle layers have a unique name, which is set by the attribute :code:`name`. PaddlePaddle sets it automatically when the user does not set it explicitly. :code:`paddle.layer.memory` is not a real layer; its name is set by the attribute :code:`memory_name`, which PaddlePaddle also sets automatically when not given. The :code:`name` attribute of :code:`paddle.layer.memory` specifies the layer it is associated with and needs to be set explicitly by the user, as sketched below.
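+
+A minimal sketch of this association inside a recurrent_group step function (the layer and memory names are illustrative):
+
+.. code-block:: python
+
+    def step(x):
+        # reads the previous timestep's output of the layer named 'rnn_state'
+        mem = paddle.layer.memory(name='rnn_state', size=128)
+        # the fc layer's name matches the memory's name, closing the recurrence
+        return paddle.layer.fc(input=[x, mem], size=128, name='rnn_state')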
+
+
+3. What is the difference between the two ways of using dropout
+-----------------------------------------------------------------
+
+* There are two ways to use dropout in PaddlePaddle
+
+ * Set the :code:`drop_rate` parameter in the layer's :code:`layer_attr` attribute. Take :code:`paddle.layer.fc` as an example:
+
+ .. code-block:: python
+
+ fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
+
+ * Use :code:`paddle.layer.dropout` layer. Take :code:`paddle.layer.fc` as an example:
+
+ .. code-block:: python
+
+ fc = paddle.layer.fc(input=input)
+ drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
+
+* :code:`paddle.layer.dropout` actually uses the :code:`paddle.layer.add_to` layer and sets :code:`drop_rate` in the same way as the first method. This method is very memory intensive.
+
+* PaddlePaddle implements dropout in the activation function rather than in the layer.
+
+* :code:`paddle.layer.lstmemory`, :code:`paddle.layer.grumemory`, :code:`paddle.layer.recurrent` implement activation of output in an unusual way, so we cannot use dropout by setting :code:`drop_rate` . To use dropout for these layers, we could use the second method, which is to use :code:`paddle.layer.dropout`.
+
+4. The differences between different recurrent layers
+--------------------------------------------------------
+Take LSTM as an example. There are several kinds of recurrent layers in PaddlePaddle:
+
+* :code:`paddle.layer.lstmemory`
+* :code:`paddle.networks.simple_lstm`
+* :code:`paddle.networks.lstmemory_group`
+* :code:`paddle.networks.bidirectional_lstm`
+
+According to implementations, recurrent layer can be classified into 2 types:
+
+1. Recurrent layer implemented by recurrent_group:
+
+ * Using this type of recurrent layers, users can access the intermediate value calculated by the recurrent unit within a timestep (eg: hidden states, memory cells, etc.)
+ * :code:`paddle.networks.lstmemory_group` belongs to this type of recurrent layers.
+
+2. Recurrent layer implemented as a complete operation:
+
+ * Users can only access output values when using this type of recurrent layers.
+ * :code:`paddle.layer.lstmemory` , :code:`paddle.networks.simple_lstm` and :code:`paddle.networks.bidirectional_lstm` belong to this type of recurrent layer.
+
+By implementing recurrent layer as a complete operation, CPU and GPU calculations can be optimized. Therefore, the second type of recurrent layer is more efficient than the first one. In practical applications, we propose to use the second type of recurrent layers if there is no need to access the intermediate variable of LSTM.
+
+In addition, PaddlePaddle also contains a kind of LSTM calculation unit: :code:`paddle.networks.lstmemory_unit`:
+
+ * Unlike the recurrent layer described above, :code:`paddle.networks.lstmemory_unit` defines the computational process of an LSTM unit in a timestep. It is not a complete recurrent layer, nor can it receive sequence data as input.
+ * :code:`paddle.networks.lstmemory_unit` can only be used as a step function in recurrent_group, as sketched below.
+
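+A minimal sketch of using :code:`paddle.networks.lstmemory_unit` as a step function (the names and the size parameter are illustrative; exact arguments may differ):
+
+.. code-block:: python
+
+    def step(x):
+        return paddle.networks.lstmemory_unit(input=x, size=256)
+
+    out = paddle.layer.recurrent_group(step=step, input=seq_input)
+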
+5. Can Softmax's calculation dimension be specified?
+--------------------------------------------------------------------
+
+The calculation dimension of PaddlePaddle's softmax cannot be specified; it is always calculated by rows.
+In image tasks, for NCHW data, if you need to calculate softmax in the C dimension, you can use :code:`paddle.layer.switch_order` to change the dimension order, that is, convert NCHW to NHWC, then reshape and calculate softmax.
+
+6. Does PaddlePaddle support variable-dimensional data inputs
+----------------------------------------------------------------
+
+PaddlePaddle provides :code:`paddle.data_type.dense_array` to support variable-dimensional data input. Simply set the dimension of the data layer to a value larger than the dimension of the input data, as a placeholder.
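+
+A minimal sketch of this placeholder trick (the dimension 2048 is an arbitrary upper bound, assumed larger than any real input):
+
+.. code-block:: python
+
+    x = paddle.layer.data(name='x', type=paddle.data_type.dense_array(2048))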
diff --git a/doc/v2/howto/cmd_parameter/index_en.rst b/doc/v2/howto/cmd_parameter/index_en.rst
index 0e3c72d27aca063f1b6f1c23e55718dba373c40a..f49683948ef78f363e2439cc25332431830eeb24 100644
--- a/doc/v2/howto/cmd_parameter/index_en.rst
+++ b/doc/v2/howto/cmd_parameter/index_en.rst
@@ -2,10 +2,25 @@
Set Command-line Parameters
===========================
+The implementation of deep learning algorithms varies in many aspects, such as the running environment, the running stage, the structure of the model, and the training strategy. PaddlePaddle lets the user set various command-line parameters flexibly, which helps to control the model training or prediction process.
+
+In this part, we take several actual scenarios as examples and show the use of some command-line parameters:
.. toctree::
:maxdepth: 1
use_case_en.md
+
+Then, we summarize and classify the use of all command-line parameters:
+
+.. toctree::
+ :maxdepth: 1
+
arguments_en.md
+
+Finally, detailed descriptions are given, and we try to explain the properties and significance of these command-line parameters in detail:
+
+.. toctree::
+ :maxdepth: 1
+
detail_introduction_en.md
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index d2a4b1335464f553a361728e64ed5ca177ca53da..c44f8a8a8ecc1ba1f886fc41aec863b4ca3458a6 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT WITH_FLUID)
+if(NOT WITH_FLUID_ONLY)
add_subdirectory(cuda)
add_subdirectory(function)
add_subdirectory(utils)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index a4ea74a6d2fbc29dc33a6b57ee453f49ed36c7fa..c425c71160a8fa3830a5fbdae1baaed850710877 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(details)
# ddim lib
proto_library(framework_proto SRCS framework.proto)
@@ -87,6 +88,9 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table feed_fetch_method)
+
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
+
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index 3693bc25d81a8309df1a6ddf3d9b08d484596ea9..fbe08349c37c4fde115ceea954ba2b84880088d7 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -147,15 +147,52 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
return;
}
+  auto get_vars = [](std::deque<std::unique_ptr<OpDesc>>::iterator &op,
+                     std::vector<std::string> &v) {
+ auto in_names = (*op)->InputArgumentNames();
+ v.insert(v.end(), in_names.begin(), in_names.end());
+ auto out_names = (*op)->OutputArgumentNames();
+ v.insert(v.end(), out_names.begin(), out_names.end());
+ std::sort(v.begin(), v.end());
+ auto last = std::unique(v.begin(), v.end());
+ v.erase(last, v.end());
+ };
need_update_ = true;
- for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) {
- auto names = (*it)->InputArgumentNames();
- for (auto n : names) {
- // TODO(typhoonzero): delete vars if no other op use it.
- VLOG(3) << "deleting var " << n;
+
+ for (size_t i = s; i < e; i++) {
+    // Since ops are removed one by one, we always remove the first op in the range.
+ auto op = ops_.begin() + s;
+
+ // collect input and output variables from current delete op
+    std::vector<std::string> cur_vars;
+ get_vars(op, cur_vars);
+
+ // remove current op
+ ops_.erase(ops_.begin() + s);
+
+ // collect input and output variables from other ops
+    std::vector<std::string> other_vars;
+ for (auto it = ops_.begin(); it != ops_.end(); it++) {
+ get_vars(it, other_vars);
+ }
+
+ // variables should be deleted
+    std::vector<std::string> delete_vars;
+    // delete_vars = cur_vars - (cur_vars ^ other_vars)
+ std::set_difference(cur_vars.begin(), cur_vars.end(), other_vars.begin(),
+ other_vars.end(),
+ std::inserter(delete_vars, delete_vars.end()));
+ // remove variables
+ for (size_t i = 0; i < delete_vars.size(); i++) {
+ auto name = delete_vars[i];
+ auto it = vars_.find(name);
+ PADDLE_ENFORCE(it != vars_.end(),
+ "%s is not in variable list, it should not be deleted",
+ name);
+ vars_.erase(it);
+ VLOG(3) << "deleting variable " << name;
}
}
- ops_.erase(ops_.begin() + s, ops_.begin() + e);
}
std::vector<OpDesc *> BlockDesc::AllOps() const {
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 185f018ac1b5863e0ee86fdaa17df1ccbc6e030e..468423e0e8e7b8c9ebc14b7568c9c3bd21645ea7 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -89,6 +89,11 @@ class BlockDesc {
OpDesc *InsertOp(size_t index);
+ /*
+ * Remove Op and its input/output variables.
+   * Note that for either an input or output variable, if it is also an input or
+   * output variable of other ops, we should keep it.
+ */
void RemoveOp(size_t s, size_t e);
std::vector AllOps() const;
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index adfaba26ace78f547161ad4029a741f3ca8a6764..019bea600f496a6b58579ad0aa8af836cd6134a9 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -34,7 +34,7 @@ class Channel {
public:
virtual bool CanSend() = 0;
virtual bool CanReceive() = 0;
- virtual bool Send(T*) = 0;
+ virtual void Send(T*) = 0;
virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0;
virtual void Lock() = 0;
@@ -84,69 +84,81 @@ class ChannelHolder {
}
  template <typename T>
- bool Send(T* data) {
- if (!IsInitialized()) return false;
- PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T)));
+ void Send(T* data) {
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+ PADDLE_ENFORCE_EQ(
+ holder_->Type(), std::type_index(typeid(T)),
+ "Channel type is not same as the type of the data being sent");
// Static cast should be safe because we have ensured that types are same
    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
- return channel != nullptr ? channel->Send(data) : false;
+ PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
+ channel->Send(data);
}
  template <typename T>
bool Receive(T* data) {
- if (!IsInitialized()) return false;
- PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T)));
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+ PADDLE_ENFORCE_EQ(
+ holder_->Type(), std::type_index(typeid(T)),
+ "Channel type is not same as the type of the data being sent");
    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
- return channel != nullptr ? channel->Receive(data) : false;
+ PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
+ return channel->Receive(data);
}
bool IsClosed() {
- if (IsInitialized()) {
- return holder_->IsClosed();
- }
- return false;
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+ return holder_->IsClosed();
}
bool CanSend() {
- if (IsInitialized()) {
- return holder_->CanSend();
- }
- return false;
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+ return holder_->CanSend();
}
bool CanReceive() {
- if (IsInitialized()) {
- return holder_->CanReceive();
- }
- return false;
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+ return holder_->CanReceive();
}
void close() {
- if (IsInitialized()) holder_->Close();
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+ holder_->Close();
}
size_t Cap() {
- if (IsInitialized()) return holder_->Cap();
- return -1;
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+ return holder_->Cap();
}
void Lock() {
- if (IsInitialized()) holder_->Lock();
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+ holder_->Lock();
}
void Unlock() {
- if (IsInitialized()) holder_->Unlock();
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+ holder_->Unlock();
}
  template <typename T>
  void AddToSendQ(const void* referrer, T* data,
                  std::shared_ptr<std::condition_variable_any> cond,
                  std::function<bool(ChannelAction)> cb) {
- if (IsInitialized()) {
-      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
- if (channel != nullptr) {
- channel->AddToSendQ(referrer, data, cond, cb);
- }
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+ if (channel != nullptr) {
+ channel->AddToSendQ(referrer, data, cond, cb);
}
}
@@ -154,26 +166,31 @@ class ChannelHolder {
  void AddToReceiveQ(const void* referrer, T* data,
                     std::shared_ptr<std::condition_variable_any> cond,
                     std::function<bool(ChannelAction)> cb) {
- if (IsInitialized()) {
-      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
- if (channel != nullptr) {
- channel->AddToReceiveQ(referrer, data, cond, cb);
- }
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+ if (channel != nullptr) {
+ channel->AddToReceiveQ(referrer, data, cond, cb);
}
}
void RemoveFromSendQ(const void* referrer) {
- if (IsInitialized()) holder_->RemoveFromSendQ(referrer);
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+ holder_->RemoveFromSendQ(referrer);
}
void RemoveFromReceiveQ(const void* referrer) {
- if (IsInitialized()) holder_->RemoveFromReceiveQ(referrer);
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
+ holder_->RemoveFromReceiveQ(referrer);
}
inline bool IsInitialized() const { return holder_ != nullptr; }
inline const std::type_index Type() {
- PADDLE_ENFORCE_EQ(IsInitialized(), true);
+ PADDLE_ENFORCE_EQ(IsInitialized(), true,
+ "The Channel hasn't been initialized");
return holder_->Type();
}
diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h
index 457abbf373d4549229e8fd8bd6b2087cc6b8f5c8..e056779ea0dd0a31191b628f82724298efaf50ff 100644
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
@@ -31,7 +31,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
public:
virtual bool CanSend();
virtual bool CanReceive();
- virtual bool Send(T *);
+ virtual void Send(T *);
virtual bool Receive(T *);
virtual size_t Cap() { return cap_; }
virtual void Lock();
@@ -76,10 +76,9 @@ class ChannelImpl : public paddle::framework::Channel<T> {
}
};
- bool send_return(bool value) {
+ void send_return() {
send_ctr--;
destructor_cond_.notify_all();
- return value;
}
bool recv_return(bool value) {
@@ -88,6 +87,21 @@ class ChannelImpl : public paddle::framework::Channel<T> {
return value;
}
+  std::shared_ptr<QueueMessage> get_first_message(
+      std::deque<std::shared_ptr<QueueMessage>> &queue, ChannelAction action) {
+ while (!queue.empty()) {
+ // Check whether this message was added by Select
+ // If this was added by Select then execute the callback
+ // to check if you can execute this message. The callback
+ // can return false if some other case was executed in Select.
+ // In that case just discard this QueueMessage and process next.
+      std::shared_ptr<QueueMessage> m = queue.front();
+ queue.pop_front();
+ if (m->callback == nullptr || m->callback(action)) return m;
+ }
+ return nullptr;
+ }
+
size_t cap_;
std::recursive_mutex mu_;
bool closed_;
@@ -118,45 +132,33 @@ bool ChannelImpl<T>::CanReceive() {
}
template <typename T>
-bool ChannelImpl<T>::Send(T *item) {
+void ChannelImpl<T>::Send(T *item) {
send_ctr++;
  std::unique_lock<std::recursive_mutex> lock{mu_};
- // If channel is closed, do nothing
+ // If channel is closed, throw exception
if (closed_) {
+ send_return();
lock.unlock();
- // TODO(abhinavarora) Should panic on closed channel
- return send_return(false);
+ PADDLE_THROW("Cannot send on closed channel");
}
// If there is a receiver, directly pass the value we want
// to send to the receiver, bypassing the channel buffer if any
if (!recvq.empty()) {
-    std::shared_ptr<QueueMessage> m = recvq.front();
- recvq.pop_front();
- // Do the data transfer
- // We will do this data transfer if either of the following
- // cases are true
- // 1. callback == nullptr // This means it was a regular channel send
- // 2. callback returns true
- bool do_send = true;
- if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
- if (do_send)
+    std::shared_ptr<QueueMessage> m =
+ get_first_message(recvq, ChannelAction::SEND);
+
+ if (m != nullptr) {
*(m->data) = std::move(*item);
- else
- // We cannot do the data transfer because
- // this QueueMessage was added by Select
- // and some other case was executed.
- // So call the Send function again.
- // We do not care about notifying other
- // because they would have been notified
- // by the executed select case.
- return send_return(Send(item));
-
- // Wake up the blocked process and unlock
- m->Notify();
- lock.unlock();
- return send_return(true);
+ m->Notify();
+ send_return();
+ return;
+ } else {
+ Send(item);
+ send_return();
+ return;
+ }
}
// Unbuffered channel will always bypass this
@@ -165,9 +167,8 @@ bool ChannelImpl<T>::Send(T *item) {
if (buf_.size() < cap_) {
// Copy to buffer
buf_.push_back(std::move(*item));
- // Release lock and return true
- lock.unlock();
- return send_return(true);
+ send_return();
+ return;
}
// Block on channel, because some receiver will complete
@@ -175,8 +176,12 @@ bool ChannelImpl<T>::Send(T *item) {
  auto m = std::make_shared<QueueMessage>(item);
sendq.push_back(m);
m->Wait(lock);
- // TODO(abhinavarora) Should panic on closed channel
- return send_return(!m->chan_closed);
+ if (m->chan_closed) {
+ send_return();
+ lock.unlock();
+ PADDLE_THROW("Cannot send on closed channel");
+ }
+ send_return();
}
template <typename T>
@@ -186,39 +191,37 @@ bool ChannelImpl<T>::Receive(T *item) {
// If channel is closed and buffer is empty or
// channel is unbuffered
- if (closed_ && buf_.empty()) {
- lock.unlock();
- return recv_return(false);
- }
+ if (closed_ && buf_.empty()) return recv_return(false);
// If there is a sender, directly receive the value we want
- // from the sender, bypassing the channel buffer if any
+ // from the sender. In case of a buffered channel, read from
+ // buffer and move front of send queue to the buffer
if (!sendq.empty()) {
-    std::shared_ptr<QueueMessage> m = sendq.front();
- sendq.pop_front();
- // Do the data transfer
- // We will do this data transfer if either of the following
- // cases are true
- // 1. callback == nullptr // This means it was a regular channel send
- // 2. callback returns true
- bool do_receive = true;
- if (m->callback != nullptr)
- do_receive = m->callback(ChannelAction::RECEIVE);
- if (do_receive)
- *item = std::move(*(m->data));
- else
- // We cannot do the data transfer because
- // this QueueMessage was added by Select
- // and some other case was executed.
- // So call the Receive function again.
- // We do not care about notifying other
- // because they would have been notified
- // by the executed select case.
- return recv_return(Receive(item));
-
- // Wake up the blocked process and unlock
- m->Notify();
- lock.unlock();
+    std::shared_ptr<QueueMessage> m =
+ get_first_message(sendq, ChannelAction::RECEIVE);
+ if (buf_.size() > 0) {
+ // Case 1 : Channel is Buffered
+ // Do Data transfer from front of buffer
+ // and add a QueueMessage to the buffer
+ *item = std::move(buf_.front());
+ buf_.pop_front();
+ // If first message from sendq is not null
+ // add it to the buffer and notify it
+ if (m != nullptr) {
+ // Copy to buffer
+ buf_.push_back(std::move(*(m->data)));
+ m->Notify();
+ } // Ignore if there is no first message
+ } else {
+ // Case 2: Channel is Unbuffered
+ // Do data transfer from front of SendQ
+ // If front is nullptr, then recursively call itself
+ if (m != nullptr) {
+ *item = std::move(*(m->data));
+ m->Notify();
+ } else
+ return recv_return(Receive(item));
+ }
return recv_return(true);
}
@@ -227,8 +230,7 @@ bool ChannelImpl<T>::Receive(T *item) {
// Directly read from buffer
*item = std::move(buf_.front());
buf_.pop_front();
- // Release lock and return true
- lock.unlock();
+ // return true
return recv_return(true);
}
diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc
index 73be5cdbe2a1f5994ecee4c415e83962f50532fe..1184bfdae1940286fb72d9091ae4f23ff7f84a54 100644
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -16,7 +16,6 @@ limitations under the License. */
#include <chrono>
#include <thread>
-
#include "gtest/gtest.h"
using paddle::framework::Channel;
@@ -37,23 +36,25 @@ TEST(Channel, ChannelCapacityTest) {
delete ch;
}
-void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
+void RecevingOrderEqualToSendingOrder(Channel<int> *ch, int num_items) {
unsigned sum_send = 0;
std::thread t([&]() {
- for (int i = 0; i < 5; i++) {
- EXPECT_EQ(ch->Send(&i), true);
+ for (int i = 0; i < num_items; i++) {
+ ch->Send(&i);
sum_send += i;
}
});
- for (int i = 0; i < 5; i++) {
- int recv = 999;
+ std::this_thread::sleep_for(std::chrono::milliseconds(200));
+ for (int i = 0; i < num_items; i++) {
+ int recv = -1;
EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));
CloseChannel(ch);
t.join();
- EXPECT_EQ(sum_send, 10U);
+ unsigned expected_sum = (num_items * (num_items - 1)) / 2;
+ EXPECT_EQ(sum_send, expected_sum);
delete ch;
}
@@ -61,7 +62,7 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10;
  auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) {
- EXPECT_EQ(ch->Send(&i), true); // should not block
+ ch->Send(&i);
}
size_t out;
@@ -82,7 +83,7 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
const size_t data = 5;
std::thread send_thread{[&]() {
size_t i = data;
- EXPECT_EQ(ch->Send(&i), true); // should not block
+ ch->Send(&i); // should not block
}};
std::thread recv_thread{[&]() {
@@ -94,12 +95,18 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
send_thread.join();
recv_thread.join();
- // After closing send should return false. Receive should
- // also return false as there is no data in queue.
+  // After closing, send should panic. Receive should
+  // also return false as there is no data in queue.
CloseChannel(ch);
send_thread = std::thread{[&]() {
size_t i = data;
- EXPECT_EQ(ch->Send(&i), false); // should return false
+ bool is_exception = false;
+ try {
+ ch->Send(&i);
+ } catch (paddle::platform::EnforceNotMet e) {
+ is_exception = true;
+ }
+ EXPECT_EQ(is_exception, true);
}};
recv_thread = std::thread{[&]() {
size_t i;
@@ -129,7 +136,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
  auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) {
- EXPECT_EQ(ch->Send(&i), true); // sending should not block
+ ch->Send(&i); // sending should not block
}
size_t out;
@@ -160,9 +167,16 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
if (i < buffer_size)
- EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations
- else
- EXPECT_EQ(ch->Send(&i), false);
+ ch->Send(&i); // should block after 10 iterations
+ else {
+ bool is_exception = false;
+ try {
+ ch->Send(&i);
+ } catch (paddle::platform::EnforceNotMet e) {
+ is_exception = true;
+ }
+ EXPECT_EQ(is_exception, true);
+ }
}
});
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
@@ -173,12 +187,28 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
  auto ch = MakeChannel<int>(0);
- RecevingOrderEqualToSendingOrder(ch);
+ RecevingOrderEqualToSendingOrder(ch, 20);
+}
+
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) {
+ // Test that Receive Order is same as Send Order when number of items
+ // sent is less than size of buffer
+  auto ch = MakeChannel<int>(10);
+ RecevingOrderEqualToSendingOrder(ch, 5);
}
-TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) {
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) {
+ // Test that Receive Order is same as Send Order when number of items
+ // sent is equal to size of buffer
  auto ch = MakeChannel<int>(10);
- RecevingOrderEqualToSendingOrder(ch);
+ RecevingOrderEqualToSendingOrder(ch, 10);
+}
+
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
+ // Test that Receive Order is same as Send Order when number of items
+ // sent is greater than the size of buffer
+  auto ch = MakeChannel<int>(10);
+ RecevingOrderEqualToSendingOrder(ch, 20);
}
void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
@@ -231,7 +261,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
- *success = ch->Send(&data);
+ bool is_exception = false;
+ try {
+ ch->Send(&data);
+ } catch (paddle::platform::EnforceNotMet e) {
+ is_exception = true;
+ }
+ *success = !is_exception;
*ended = true;
},
&thread_ended[i], &send_success[i]);
@@ -316,8 +352,11 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
// Try to send more number of times
// than receivers
for (int i = 0; i < 4; i++) {
- ch->Send(&i);
- sum_send += i;
+ try {
+ ch->Send(&i);
+ sum_send += i;
+ } catch (paddle::platform::EnforceNotMet e) {
+ }
}
});
for (int i = 0; i < 3; i++) {
@@ -382,7 +421,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
- *success = ch->Send(&data);
+ bool is_exception = false;
+ try {
+ ch->Send(&data);
+ } catch (paddle::platform::EnforceNotMet e) {
+ is_exception = true;
+ }
+ *success = !is_exception;
*ended = true;
},
&thread_ended[i], &send_success[i]);
@@ -508,7 +553,7 @@ void ChannelHolderSendReceive(ChannelHolder *ch) {
unsigned sum_send = 0;
std::thread t([&]() {
for (int i = 0; i < 5; i++) {
- EXPECT_EQ(ch->Send(&i), true);
+ ch->Send(&i);
sum_send += i;
}
});
@@ -541,8 +586,22 @@ TEST(ChannelHolder, ChannelUninitializedTest) {
ChannelHolder *ch = new ChannelHolder();
EXPECT_EQ(ch->IsInitialized(), false);
int i = 10;
- EXPECT_EQ(ch->Send(&i), false);
- EXPECT_EQ(ch->Receive(&i), false);
+ bool send_exception = false;
+ try {
+ ch->Send(&i);
+ } catch (paddle::platform::EnforceNotMet e) {
+ send_exception = true;
+ }
+ EXPECT_EQ(send_exception, true);
+
+ bool recv_exception = false;
+ try {
+ ch->Receive(&i);
+ } catch (paddle::platform::EnforceNotMet e) {
+ recv_exception = true;
+ }
+ EXPECT_EQ(recv_exception, true);
+
bool is_exception = false;
try {
ch->Type();
@@ -669,7 +728,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
- *success = ch->Send(&data);
+ bool is_exception = false;
+ try {
+ ch->Send(&data);
+ } catch (paddle::platform::EnforceNotMet e) {
+ is_exception = true;
+ }
+ *success = !is_exception;
*ended = true;
},
&thread_ended[i], &send_success[i]);
@@ -760,7 +825,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
- *success = ch->Send(&data);
+ bool is_exception = false;
+ try {
+ ch->Send(&data);
+ } catch (paddle::platform::EnforceNotMet e) {
+ is_exception = true;
+ }
+ *success = !is_exception;
*ended = true;
},
&thread_ended[i], &send_success[i]);
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bf1a705ef50b663efa53393ead1f81fd6bcf8c48
--- /dev/null
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -0,0 +1,21 @@
+cc_library(var_handle SRCS var_handle.cc DEPS place)
+cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
+cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+ dynload_cuda)
+cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
+
+cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
+cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+
+if(WITH_GPU)
+ set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+else()
+ set(multi_devices_graph_builder_deps)
+endif()
+cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
+ scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph)
+cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
+ simple_threadpool device_context)
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7a1b40c0b60a788b1f0a70e688f8fcbe427ad076
--- /dev/null
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+ platform::Place place)
+ : op_(framework::OpRegistry::CreateOp(op_desc)),
+ scope_(scope),
+ place_(place) {}
+
+void ComputationOpHandle::RunImpl() {
+ auto *cur_ctx = dev_ctxes_[place_];
+ for (auto *in : inputs_) {
+ bool need_wait =
+ in->generated_op_ && in->generated_op_->dev_ctxes_[place_] != cur_ctx;
+ if (need_wait) {
+ in->generated_op_->Wait(cur_ctx);
+ }
+ }
+
+ op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
+}
+
+std::string ComputationOpHandle::Name() const { return op_->Type(); }
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..d6d2d731ca80a0fbc0a2a34027b5b7c3c1977c07
--- /dev/null
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+struct ComputationOpHandle : public OpHandleBase {
+ std::unique_ptr<OperatorBase> op_;
+ Scope *scope_;
+ platform::Place place_;
+
+ ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+ platform::Place place);
+
+ std::string Name() const override;
+
+ protected:
+ void RunImpl() override;
+};
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9180903b864d03e59f55f41410b2240fa4199496
--- /dev/null
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -0,0 +1,79 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fetch_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset,
+ std::vector<Scope *> *local_scopes)
+ : data_(data), offset_(offset), local_scopes_(local_scopes) {}
+
+FetchOpHandle::~FetchOpHandle() {
+ for (auto *input_var : inputs_) {
+ input_var->pending_ops_.erase(this);
+ }
+}
+
+void FetchOpHandle::Wait(platform::DeviceContext *waited_dev) {
+ PADDLE_THROW("Nobody should wait on FetchOp. Unexpected Error");
+}
+
+void FetchOpHandle::WaitAndMergeCPUTensors() const {
+ std::vector<const LoDTensor *> tensors_ptr;
+ tensors_ptr.reserve(tensors_.size());
+ for (auto &t : tensors_) {
+ tensors_ptr.emplace_back(&t);
+ }
+ data_->at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+}
+
+void FetchOpHandle::RunImpl() {
+ auto cpu_ctx =
+ platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+ for (auto *input : inputs_) {
+ auto *var = static_cast<VarHandle *>(input);
+ var->generated_op_->Wait(cpu_ctx);
+ }
+
+ tensors_.resize(inputs_.size());
+ auto *var = static_cast<VarHandle *>(inputs_[0]);
+ auto &var_name = var->name_;
+ platform::CPUPlace cpu;
+ auto &scopes = *local_scopes_;
+
+ for (size_t i = 0; i < scopes.size(); ++i) {
+ auto &scope = scopes[i];
+ auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
+ if (platform::is_gpu_place(var->place_)) {
+#ifdef PADDLE_WITH_CUDA
+ TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
+ dev_ctxes_[t.place()]->Wait();
+#endif
+ } else {
+ tensors_[i].ShareDataWith(t);
+ tensors_[i].set_lod(t.lod());
+ }
+ }
+
+ this->WaitAndMergeCPUTensors();
+}
+
+std::string FetchOpHandle::Name() const { return "Fetch"; }
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
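
`FetchOpHandle` copies each device's tensor to the CPU and then concatenates them. A hedged illustration of the merge step, grounded only in the `MergeLoDTensor` call shown above (the `MergeExample` helper is hypothetical):

```cpp
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"

// Hypothetical illustration of WaitAndMergeCPUTensors' final step: the
// per-device CPU tensors are concatenated into a single LoDTensor, so fetch
// slot offset_ ends up holding one tensor spanning all devices' shards.
void MergeExample(const std::vector<paddle::framework::LoDTensor> &tensors,
                  paddle::framework::LoDTensor *out) {
  std::vector<const paddle::framework::LoDTensor *> ptrs;
  ptrs.reserve(tensors.size());
  for (auto &t : tensors) ptrs.emplace_back(&t);
  out->MergeLoDTensor(ptrs, paddle::platform::CPUPlace());
}
```
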
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..904b2d669f8b156b99197afb0155380d1170a68b
--- /dev/null
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FetchOpHandle : public OpHandleBase {
+ FeedFetchList *data_;
+ size_t offset_;
+ std::vector<Scope *> *local_scopes_;
+ std::vector<LoDTensor> tensors_;
+
+ FetchOpHandle(FeedFetchList *data, size_t offset,
+ std::vector<Scope *> *local_scopes);
+
+ ~FetchOpHandle();
+
+ void Wait(platform::DeviceContext *waited_dev) override;
+
+ void WaitAndMergeCPUTensors() const;
+
+ std::string Name() const override;
+
+ protected:
+ void RunImpl() override;
+};
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c277bd7cb69bba899296efe64107ee538c4aa847
--- /dev/null
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -0,0 +1,181 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include "paddle/fluid/framework/scope.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+#ifdef PADDLE_WITH_CUDA
+MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
+ const std::vector<platform::Place> &places,
+ const std::string &loss_var_name,
+ const std::unordered_set<std::string> &params,
+ const std::vector<Scope *> &local_scopes,
+ platform::NCCLContextMap *nccl_ctxs)
+ : loss_var_name_(loss_var_name),
+ places_(places),
+ local_scopes_(local_scopes),
+ nccl_ctxs_(nccl_ctxs) {
+#else
+MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
+ const std::vector<platform::Place> &places,
+ const std::string &loss_var_name,
+ const std::unordered_set<std::string> &params,
+ const std::vector<Scope *> &local_scopes)
+ : loss_var_name_(loss_var_name),
+ places_(places),
+ local_scopes_(local_scopes) {
+#endif
+ for (auto &p : params) {
+ grad_names_.insert(GradVarName(p));
+ }
+}
+
+std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
+ const ProgramDesc &program) const {
+ auto graph = new SSAGraph();
+ SSAGraph &result = *graph;
+ std::unordered_set<std::string> og_has_been_broadcast;
+ result.vars_.resize(places_.size());
+
+ bool is_forwarding = true;
+ for (auto *op : program.Block(0).AllOps()) {
+ bool change_forward = false;
+ if (!is_forwarding) {
+ // FIXME(yy): Do not hard code like this
+ if (op->OutputArgumentNames().size() == 1 &&
+ op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) {
+ continue; // Drop the fill-1.0 op for the backward coefficient.
+ }
+ }
+
+ for (size_t i = 0; i < places_.size(); ++i) {
+ auto &p = places_[i];
+ auto *s = local_scopes_[i];
+
+ result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
+ auto *op_handle = result.ops_.back().get();
+ op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
+ platform::DeviceContextPool::Instance().Get(p));
+
+ auto var_names = op->InputArgumentNames();
+
+ for (auto &each_var_name : var_names) {
+ VarHandle *var =
+ CreateOrGetLatestVarHandle(&result, each_var_name, p, i);
+ op_handle->AddInput(var);
+ }
+ var_names = op->OutputArgumentNames();
+
+ for (auto &each_var_name : var_names) {
+ CreateOpOutput(&result, op_handle, each_var_name, p, i);
+ }
+
+ if (is_forwarding) {
+ if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
+// Insert ScaleCost OpHandle
+#ifdef PADDLE_WITH_CUDA
+ auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
+#else
+ auto *communication_dev_ctx =
+ platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+#endif
+
+ op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
+ communication_dev_ctx);
+ result.ops_.emplace_back(op_handle);
+
+ // FIXME: Currently ScaleLossGradOp only uses device_count as the scale
+ // factor, so it does not depend on any other operators.
+ // VarHandle *loss = GetVarHandle(loss_var_name, place);
+ // loss->pending_ops_.emplace_back(op_handle);
+ // op_handle->inputs_.emplace_back(loss);
+
+ CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i);
+ change_forward = true;
+ }
+ }
+ }
+
+ if (change_forward) {
+ is_forwarding = false;
+ }
+
+ if (!is_forwarding) {
+ auto var_names = op->OutputArgumentNames();
+ // Currently, we assume that once a gradient is generated, it can be
+ // broadcast, and that each gradient is broadcast only once. Other cases,
+ // such as adjusting the gradient according to the input at the time the
+ // gradient is produced, are not considered at present.
+ for (auto &og : var_names) {
+ if (grad_names_.count(og) != 0 &&
+ og_has_been_broadcast.count(og) == 0) { // is param grad
+ // Insert NCCL AllReduce Op
+ og_has_been_broadcast.insert(og);
+#ifdef PADDLE_WITH_CUDA
+ result.ops_.emplace_back(
+ new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
+ auto *op_handle = result.ops_.back().get();
+
+ for (size_t i = 0; i < places_.size(); ++i) {
+ auto &p = places_[i];
+ auto &vars = result.vars_[i][og];
+
+ if (vars.empty()) { // This device has no data. continue.
+ continue;
+ }
+ auto *prev_grad = &vars[vars.size() - 1];
+ op_handle->AddInput(prev_grad);
+
+ auto &var = vars[vars.size()];
+ var.place_ = p;
+ var.name_ = og;
+ var.version_ = vars.size() - 1;
+
+ op_handle->AddOutput(&var);
+ }
+#else
+ PADDLE_ENFORCE("Not implemented");
+#endif
+ }
+ }
+ }
+ }
+
+ /*
+ Dependency graph has been constructed. However, there are still data
+ hazards that need to be handled.
+ */
+ PolishGraphToSupportDataHazards(&result);
+
+ if (VLOG_IS_ON(10)) {
+ std::ostringstream sout;
+ PrintGraphviz(*graph, sout);
+ VLOG(10) << sout.str();
+ }
+
+ return std::unique_ptr<SSAGraph>(graph);
+}
+} // namespace details
+} // namespace framework
+} // namespace paddle
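
For context, a builder constructed as declared in the header below is driven roughly as follows. This is a hedged usage sketch: the driver function is hypothetical, a CUDA build is assumed, and only the APIs declared in this diff are used.

```cpp
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"

namespace fd = paddle::framework::details;
namespace platform = paddle::platform;
using paddle::framework::ProgramDesc;
using paddle::framework::Scope;

// Hypothetical driver: replicate block 0 of `program` across `places`,
// scaling the loss gradient per device and all-reducing every parameter
// gradient via NCCL.
std::unique_ptr<fd::SSAGraph> BuildGraph(
    const ProgramDesc &program, const std::vector<platform::Place> &places,
    const std::vector<Scope *> &local_scopes,
    const std::unordered_set<std::string> &params,
    platform::NCCLContextMap *nccl_ctxs) {
  fd::MultiDevSSAGraphBuilder builder(places, "loss" /* loss var name */,
                                      params, local_scopes, nccl_ctxs);
  return builder.Build(program);
}
```
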
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3c8e582cf2cdf26198822e4bd2602883622df21
--- /dev/null
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+
+namespace paddle {
+namespace platform {
+class NCCLContextMap;
+}
+
+namespace framework {
+class Scope;
+namespace details {
+class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
+ public:
+#ifdef PADDLE_WITH_CUDA
+ MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
+ const std::string &loss_var_name,
+ const std::unordered_set<std::string> &params,
+ const std::vector<Scope *> &local_scopes,
+ platform::NCCLContextMap *nccl_ctxs);
+#else
+ MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
+ const std::string &loss_var_name,
+ const std::unordered_set<std::string> &params,
+ const std::vector<Scope *> &local_scopes);
+#endif
+
+ std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
+
+ private:
+ std::string loss_var_name_;
+ const std::vector<platform::Place> &places_;
+ const std::vector<Scope *> &local_scopes_;
+ std::unordered_set<std::string> grad_names_;
+
+#ifdef PADDLE_WITH_CUDA
+ platform::NCCLContextMap *nccl_ctxs_;
+#endif
+};
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55b5f113589e090386d287e228349f22fb94a7ab
--- /dev/null
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
+ const std::vector<Scope *> &local_scopes,
+ const std::vector<platform::Place> &places,
+ const platform::NCCLContextMap &ctxs)
+ : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
+ for (auto &p : places_) {
+ this->dev_ctxes_[p] = nccl_ctxs_.DevCtx(p);
+ }
+}
+
+void NCCLAllReduceOpHandle::RunImpl() {
+ if (inputs_.size() == 1) {
+ return; // No need to all reduce when GPU count = 1;
+ } else {
+ // Wait input done
+ for (auto *in : inputs_) {
+ auto &p = static_cast<VarHandle *>(in)->place_;
+ in->generated_op_->Wait(dev_ctxes_[p]);
+ }
+
+ auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
+ int dtype = -1;
+ size_t numel = 0;
+
+ std::vector<std::function<void()>> all_reduce_calls;
+
+ for (size_t i = 0; i < local_scopes_.size(); ++i) {
+ auto &p = places_[i];
+ auto *s = local_scopes_[i];
+ int dev_id = boost::get<platform::CUDAPlace>(p).device;
+
+ auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
+ void *buffer = const_cast<void *>(lod_tensor.data<void>());
+
+ if (dtype == -1) {
+ dtype = platform::ToNCCLDataType(lod_tensor.type());
+ }
+
+ if (numel == 0) {
+ numel = static_cast(lod_tensor.numel());
+ }
+
+ auto &nccl_ctx = nccl_ctxs_.at(dev_id);
+ auto stream = nccl_ctx.stream();
+ auto comm = nccl_ctx.comm_;
+ all_reduce_calls.emplace_back([=] {
+ PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+ buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
+ comm, stream));
+ });
+ }
+
+ platform::NCCLGroupGuard guard;
+ for (auto &call : all_reduce_calls) {
+ call();
+ }
+ }
+}
+
+std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; }
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad14a3c5cb4625fa121cad2daed389c441e78771
--- /dev/null
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct NCCLAllReduceOpHandle : public OpHandleBase {
+ const std::vector<Scope *> &local_scopes_;
+ const std::vector<platform::Place> &places_;
+ const platform::NCCLContextMap &nccl_ctxs_;
+
+ NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+ const std::vector<platform::Place> &places,
+ const platform::NCCLContextMap &ctxs);
+
+ std::string Name() const override;
+
+ // Delaying and buffering nccl_all_reduce calls together can significantly
+ // improve performance. Returning false disables this feature.
+ bool IsMultiDeviceTransfer() override { return true; };
+
+ protected:
+ void RunImpl() override;
+};
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
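
`IsMultiDeviceTransfer()` exists so that an executor can treat all-reduce ops specially, as the comment above notes. A hypothetical scheduling fragment (not part of this diff) that defers such ops and flushes them back-to-back, so their NCCL calls can be grouped:

```cpp
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"

namespace fd = paddle::framework::details;

// Hypothetical executor step: run compute ops eagerly, buffer multi-device
// transfers, then flush the buffered transfers together.
void RunReadyOps(const std::vector<fd::OpHandleBase *> &ready_ops) {
  std::vector<fd::OpHandleBase *> delayed;
  for (auto *op : ready_ops) {
    if (op->IsMultiDeviceTransfer()) {
      delayed.push_back(op);  // e.g. NCCLAllReduceOpHandle returns true
    } else {
      op->Run(/*use_event=*/true);
    }
  }
  for (auto *op : delayed) {
    op->Run(/*use_event=*/true);
  }
}
```
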
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e4194a7442f677ec8970dbc387bb01ebbbf579f1
--- /dev/null
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -0,0 +1,102 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+std::string OpHandleBase::DebugString() const {
+ std::stringstream ss;
+ ss << "(";
+ for (auto *var : inputs_) {
+ ss << var->DebugString() << ", ";
+ }
+ ss << ") --> (";
+ for (auto *var : outputs_) {
+ ss << var->DebugString() << ", ";
+ }
+ ss << ")\n";
+ return ss.str();
+}
+
+OpHandleBase::~OpHandleBase() {
+#ifdef PADDLE_WITH_CUDA
+ for (auto &ev : events_) {
+ PADDLE_ENFORCE(cudaEventDestroy(ev.second));
+ }
+#endif
+}
+
+void OpHandleBase::Run(bool use_event) {
+#ifdef PADDLE_WITH_CUDA
+ if (events_.empty() && use_event) {
+ for (auto &p : dev_ctxes_) {
+ int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
+ PADDLE_ENFORCE(cudaSetDevice(dev_id));
+ PADDLE_ENFORCE(
+ cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
+ }
+ }
+#else
+ PADDLE_ENFORCE(!use_event);
+#endif
+
+ RunImpl();
+
+#ifdef PADDLE_WITH_CUDA
+ if (use_event) {
+ for (auto &p : dev_ctxes_) {
+ int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
+ auto stream =
+ static_cast<platform::CUDADeviceContext *>(p.second)->stream();
+ PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream));
+ }
+ }
+#endif
+}
+
+void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
+#ifdef PADDLE_WITH_CUDA
+ if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) {
+ for (auto &dev_ctx : dev_ctxes_) {
+ dev_ctx.second->Wait();
+ }
+ } else {
+ auto stream =
+ static_cast<platform::CUDADeviceContext *>(waited_dev)->stream();
+ for (auto &ev : events_) {
+ PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
+ }
+ }
+#else
+ for (auto &dev_ctx : dev_ctxes_) {
+ dev_ctx.second->Wait();
+ }
+#endif
+}
+
+void OpHandleBase::AddInput(VarHandleBase *in) {
+ this->inputs_.emplace_back(in);
+ in->pending_ops_.insert(this);
+}
+
+void OpHandleBase::AddOutput(VarHandleBase *out) {
+ outputs_.emplace_back(out);
+ out->generated_op_ = this;
+}
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7a541ac4bb83625060db337446d03a1afda3ed0
--- /dev/null
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class OpHandleBase {
+ private:
+ DISABLE_COPY_AND_ASSIGN(OpHandleBase);
+
+ public:
+ std::vector<VarHandleBase *> inputs_;
+ std::vector<VarHandleBase *> outputs_;
+ std::unordered_map<platform::Place, platform::DeviceContext *,
+ platform::PlaceHash> dev_ctxes_;
+
+#ifdef PADDLE_WITH_CUDA
+ std::unordered_map<int, cudaEvent_t> events_;
+#endif
+
+ OpHandleBase() {}
+
+ std::string DebugString() const;
+
+ virtual std::string Name() const = 0;
+
+ virtual ~OpHandleBase();
+
+ void Run(bool use_event);
+
+ virtual void Wait(platform::DeviceContext *waited_dev);
+
+ void AddInput(VarHandleBase *in);
+
+ void AddOutput(VarHandleBase *out);
+
+ // True if the op involves data transfer across multiple devices and is
+ // therefore likely to block other computations.
+ virtual bool IsMultiDeviceTransfer() { return false; }
+
+ protected:
+ virtual void RunImpl() = 0;
+};
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
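
To make the contract above concrete, a minimal hypothetical subclass (not part of this diff): implementors override `RunImpl()` and `Name()`, graph wiring goes through `AddInput`/`AddOutput`, and `Run(use_event)` wraps `RunImpl()` with the CUDA-event bookkeeping shown in op_handle_base.cc:

```cpp
#include <string>
#include "paddle/fluid/framework/details/op_handle_base.h"

namespace paddle {
namespace framework {
namespace details {

// Hypothetical example: an op handle that does nothing. A real handle would
// read inputs_ (waiting on each input's generated_op_ when device contexts
// differ) and write its results to outputs_.
struct NoOpHandle : public OpHandleBase {
  std::string Name() const override { return "NoOp"; }

 protected:
  void RunImpl() override {}
};

}  // namespace details
}  // namespace framework
}  // namespace paddle
```
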
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a6f6129b812ca84db7573957b1ee0a32c1ef5c4
--- /dev/null
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope,
+ platform::Place place,
+ platform::DeviceContext *dev_ctx)
+ : coeff_(static_cast(1.0 / num_dev)), scope_(scope), place_(place) {
+ dev_ctxes_[place_] = dev_ctx;
+}
+
+ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
+
+void ScaleLossGradOpHandle::RunImpl() {
+ std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
+
+ float *tmp =
+ scope_->FindVar(var_name)->GetMutable<LoDTensor>()->mutable_data<float>(
+ make_ddim({1}), place_);
+
+ if (platform::is_cpu_place(place_)) {
+ *tmp = coeff_;
+ } else {
+#ifdef PADDLE_WITH_CUDA
+ auto stream =
+ static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
+ ->stream();
+ memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
+ platform::CPUPlace(), &coeff_, sizeof(float), stream);
+#endif
+ }
+}
+
+std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab7353a4fc56bebfe04696efd838dc4559218058
--- /dev/null
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct ScaleLossGradOpHandle : public OpHandleBase {
+ float coeff_;
+ Scope *scope_;
+ platform::Place place_;
+
+ ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place,
+ platform::DeviceContext *context);
+
+ ~ScaleLossGradOpHandle() final;
+
+ std::string Name() const override;
+
+ protected:
+ void RunImpl() override;
+};
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph.cc b/paddle/fluid/framework/details/ssa_graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1b8c889449059c563ea39f86250075ac2537cdbe
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph.cc
@@ -0,0 +1,15 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/ssa_graph.h"
diff --git a/paddle/fluid/framework/details/ssa_graph.h b/paddle/fluid/framework/details/ssa_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac3e2d86993aee31b79f4481c4d5a47cd9cdf5b4
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>