diff --git a/.gitignore b/.gitignore
index 2badc3bdaa52f2608183fa34393719be66630654..9e3a0b499f9f42856429f3a42bef313ea3df3699 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,12 +25,3 @@ third_party/
 
 # clion workspace.
 cmake-build-*
-
-# generated while compiling
-paddle/pybind/pybind.h
-CMakeFiles
-cmake_install.cmake
-paddle/.timestamp
-python/paddlepaddle.egg-info/
-paddle/fluid/pybind/pybind.h
-python/paddle/version.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 89c620bb2f7ef634fa80b64eec7037e8cb9a190c..6140340890c0e5025eb08209e8ea78df918b4dc0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,3 +1,4 @@
+repos:
 -   repo: https://github.com/Lucas-C/pre-commit-hooks.git
     sha: v1.0.1
     hooks:
@@ -25,6 +26,14 @@
         entry: bash ./.clang_format.hook -i
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
+-   repo: local
+    hooks:
+    -   id: cpplint-cpp-source
+        name: cpplint
+        description: Check C++ code style using cpplint.py.
+        entry: bash ./tools/codestyle/cpplint_pre_commit.hook
+        language: system
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
diff --git a/.travis.yml b/.travis.yml
index bf6a41d13c4eabc2d8543ab821ce0ff747a061df..929c847bd36d64e79a199b2634ebf68c3225429b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -34,7 +34,7 @@ addons:
       - automake
       - libtool
       - ccache
-  ssh_known_hosts: 52.76.173.135
+  ssh_known_hosts: 13.229.163.131
 before_install:
   - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
   # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ec65bac84b0b0d89123473a8941f80c90f1b339..c649aafeddaf9f28c213d086236c3779d3137d92 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,7 @@ include(simd)
 
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
+option(WITH_AMD_GPU     "Compile PaddlePaddle with AMD GPU"             OFF)
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
@@ -52,8 +53,7 @@ option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
-# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. 
-option(WITH_FLUID       "Compile PaddlePaddle fluid only(TODO)"         OFF)
+option(WITH_FLUID_ONLY  "Compile PaddlePaddle fluid only"               OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
@@ -108,7 +108,7 @@ if (WITH_C_API AND WITH_PYTHON)
 endif()
 
 if (WITH_C_API)
-  set(WITH_FLUID OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
+  set(WITH_FLUID_ONLY OFF CACHE BOOL "Disable install fluid when compile the C_API" FORCE)
 endif()
 
 if(MOBILE_INFERENCE)
@@ -146,6 +146,7 @@ include(external/cares)
 include(external/grpc)
 include(external/snappy)    # download snappy
 include(external/snappystream)
+include(external/threadpool)
 
 include(cudnn)              # set cudnn libraries, must before configure
 include(cupti)
@@ -180,6 +181,11 @@ if(WITH_GPU)
     include(cuda)
 endif(WITH_GPU)
 
+if(WITH_AMD_GPU)
+    find_package(HIP)
+    include(hip)
+endif(WITH_AMD_GPU)
+
 if(WITH_MKLML)
     list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
 endif()
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
index b619613ea7a5b6e940ec735314e8e47338b2c600..64816098a524f064ec12474a736cd4c721227a70 100644
--- a/benchmark/cluster/README.md
+++ b/benchmark/cluster/README.md
@@ -36,11 +36,41 @@
 - Trainer Count: 100
 - Metrics: mini-batch / sec
 
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
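+
+<table>
+<thead>
+<tr>
+<th>Batch Size</th>
+<th>32</th>
+<th>64</th>
+<th>128</th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>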
 
 ### Measure the Performance for Different PServer Count
 
@@ -48,11 +78,41 @@
 - Batch Size: 64
 - Metrics: mini-batch / sec
 
-| PServer Count | 10 | 20 | 40 | 60 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
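+
+<table>
+<thead>
+<tr>
+<th>PServer Count</th>
+<th>10</th>
+<th>20</th>
+<th>40</th>
+<th>60</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>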
 
 ### Measure Parallel Efficiency By Increasing Trainer Count
 
@@ -67,11 +127,69 @@ The parallel efficiency is:
 
 $E = \div(S, N)$
 
-| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
-| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - |
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - |
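+<table>
+<thead>
+<tr>
+<th>Trainer Counter</th>
+<th>1</th>
+<th>10</th>
+<th>20</th>
+<th>30</th>
+<th>40</th>
+<th>50</th>
+<th>60</th>
+<th>70</th>
+<th>80</th>
+<th>90</th>
+<th>100</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+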
 
 ## Reproduce the benchmark
 
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index cd681a1a282d9a26eac1c267bfa26967f8c3c9fd..d56a912b9b03986e32693363f82df05a34b779e9 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -16,11 +16,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 
 - Metrics: samples / sec
 
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
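+<table>
+<thead>
+<tr>
+<th>Batch Size</th>
+<th>32</th>
+<th>64</th>
+<th>128</th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>15.44</td>
+<td>16.32</td>
+<td>16.74</td>
+<td>16.79</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>15.97</td>
+<td>17.04</td>
+<td>17.60</td>
+<td>17.83</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>9.09</td>
+<td>9.10</td>
+<td>9.24</td>
+<td>8.66</td>
+</tr>
+</tbody>
+</table>
+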
 
 ### Different Batch Size
 
@@ -28,12 +58,40 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Trainer Count: 20
 - Metrics: samples / sec
 
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
-| TensorFlow | - | - | - | - |
-
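+<table>
+<thead>
+<tr>
+<th>Batch Size</th>
+<th>32</th>
+<th>64</th>
+<th>128</th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>190.20</td>
+<td>222.15</td>
+<td>247.40</td>
+<td>258.18</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2</td>
+<td>170.96</td>
+<td>233.71</td>
+<td>256.14</td>
+<td>329.23</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>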
 
 ### Accelerate Rate
 
@@ -41,11 +99,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Batch Size: 128
 - Metrics: samples / sec
 
-| Trainer Count | 20 | 40 | 80 | 100 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
-| TensorFlow | - | - | - | - |
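+<table>
+<thead>
+<tr>
+<th>Trainer Count</th>
+<th>20</th>
+<th>40</th>
+<th>80</th>
+<th>100</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid</td>
+<td>263.29 (78.64%)</td>
+<td>518.80 (77.47%)</td>
+<td>836.26 (62.44%)</td>
+<td>1019.29 (60.89%)</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2 (need more tests)</td>
+<td>326.85 (92.85%)</td>
+<td>534.58 (75.93%)</td>
+<td>853.30 (60.60%)</td>
+<td>1041.99 (59.20%)</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+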
 
 ### Different Pserver Count
 
@@ -53,11 +141,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Batch Size: 128
 - Metrics: samples/ sec
 
-| PServer Count | 3 | 6 |10 | 20 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
-| TensorFlow | - | - | - | - |
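+<table>
+<thead>
+<tr>
+<th>PServer Count</th>
+<th>3</th>
+<th>6</th>
+<th>10</th>
+<th>20</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>PaddlePaddle Fluid(should fix in next PR)</td>
+<td>589.1</td>
+<td>592.6</td>
+<td>656.4</td>
+<td>655.8</td>
+</tr>
+<tr>
+<td>PaddlePaddle v2 (need more tests)</td>
+<td>593.4</td>
+<td>791.3</td>
+<td>729.7</td>
+<td>821.7</td>
+</tr>
+<tr>
+<td>TensorFlow</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+<td>-</td>
+</tr>
+</tbody>
+</table>
+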
 
 *The performance gap between Fuild and v2 comes from the network interference.*
 
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index 786f224608f7d41c438411de0e09fedbcf2264b8..8b29227cfab2a36d5b9f6d17b837b33da8d2a92e 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -18,12 +18,13 @@ import sys
 import time
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.profiler as profiler
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
 import argparse
 import functools
 import os
+from paddle.fluid import debuger
 
 
 def str2bool(v):
@@ -182,28 +183,27 @@ def main():
             start_time = time.time()
             num_samples = 0
             train_pass_acc.reset()
-            with profiler.profiler("CPU", 'total') as prof:
-                for batch_id, data in enumerate(train_reader()):
-                    ts = time.time()
-                    img_data = np.array(
-                        map(lambda x: x[0].reshape(data_shape), data)).astype(
-                            "float32")
-                    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-                    y_data = y_data.reshape([-1, 1])
-
-                    loss, acc, b_size = exe.run(
-                        trainer_prog,
-                        feed={"pixel": img_data,
-                              "label": y_data},
-                        fetch_list=[avg_cost, batch_acc, batch_size])
-                    iters += 1
-                    num_samples += len(data)
-                    train_pass_acc.add(value=acc, weight=b_size)
-                    print(
-                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
-                        % (pass_id, iters, loss, acc,
-                           len(data) / (time.time() - ts))
-                    )  # The accuracy is the accumulation of batches, but not the current batch.
+            for batch_id, data in enumerate(train_reader()):
+                ts = time.time()
+                img_data = np.array(
+                    map(lambda x: x[0].reshape(data_shape), data)).astype(
+                        "float32")
+                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = y_data.reshape([-1, 1])
+
+                loss, acc, b_size = exe.run(
+                    trainer_prog,
+                    feed={"pixel": img_data,
+                          "label": y_data},
+                    fetch_list=[avg_cost, batch_acc, batch_size])
+                iters += 1
+                num_samples += len(data)
+                train_pass_acc.add(value=acc, weight=b_size)
+                print(
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
+                    % (pass_id, iters, loss, acc,
+                       len(data) / (time.time() - ts))
+                )  # The accuracy is the accumulation of batches, but not the current batch.
 
             pass_elapsed = time.time() - start_time
             pass_train_acc = train_pass_acc.eval()
@@ -254,9 +254,7 @@ def main():
             pserver_prog = t.get_pserver_program(current_endpoint)
             pserver_startup = t.get_startup_program(current_endpoint,
                                                     pserver_prog)
-            print("starting server side startup")
             exe.run(pserver_startup)
-            print("starting parameter server...")
             exe.run(pserver_prog)
         elif training_role == "TRAINER":
             # Parameter initialization
diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py
index 996df0e314b867ea8de618dfd3977f490fbe8372..2d220478acae46566760209dbc012cff316946aa 100644
--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ b/benchmark/cluster/vgg16/vgg16_tf.py
@@ -292,14 +292,18 @@ def run_benchmark(cluster_spec, server):
         return np.mean(test_accs)
 
     config = tf.ConfigProto(
-        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+        intra_op_parallelism_threads=1,
+        inter_op_parallelism_threads=1,
+        log_device_placement=True)
     config.gpu_options.allow_growth = True
 
     hooks = [tf.train.StopAtStepHook(last_step=1000000)]
 
     with tf.train.MonitoredTrainingSession(
-            master=server.target, is_chief=(args.task_index == 0),
-            hooks=hooks) as sess:
+            master=server.target,
+            is_chief=(args.task_index == 0),
+            hooks=hooks,
+            config=config) as sess:
         iters, num_samples, start_time = 0, 0, 0.0
         for pass_id in range(args.num_passes):
             # train
diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7a421c10979c3b9d6865a8c0b99a6410e0f46a8
--- /dev/null
+++ b/benchmark/fluid/machine_translation.py
@@ -0,0 +1,379 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""seq2seq model for fluid."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import distutils.util
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+from paddle.fluid.executor import Executor
+
+parser = argparse.ArgumentParser(description=__doc__)  # module docstring
+parser.add_argument(
+    "--embedding_dim",
+    type=int,
+    default=512,
+    help="The dimension of embedding table. (default: %(default)d)")
+parser.add_argument(
+    "--encoder_size",
+    type=int,
+    default=512,
+    help="The size of encoder bi-rnn unit. (default: %(default)d)")
+parser.add_argument(
+    "--decoder_size",
+    type=int,
+    default=512,
+    help="The size of decoder rnn unit. (default: %(default)d)")
+parser.add_argument(
+    "--batch_size",
+    type=int,
+    default=16,
+    help="The sequence number of a mini-batch data. (default: %(default)d)")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The first num of minibatch num to skip, for better performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+    "--dict_size",
+    type=int,
+    default=30000,
+    help="The dictionary capacity. Dictionaries of source sequence and "
+    "target dictionary have same capacity. (default: %(default)d)")
+parser.add_argument(  # NOTE: train() calls exit(0) after its first pass
+    "--pass_num",
+    type=int,
+    default=2,
+    help="The pass number to train. (default: %(default)d)")
+parser.add_argument(
+    "--learning_rate",
+    type=float,
+    default=0.0002,
+    help="Learning rate used to train the model. (default: %(default)f)")
+parser.add_argument(  # NOTE: infer() is currently an empty stub
+    "--infer_only", action='store_true', help="If set, run forward only.")
+parser.add_argument(  # forwarded to seq_to_seq_net, which never reads it
+    "--beam_size",
+    type=int,
+    default=3,
+    help="The width for beam searching. (default: %(default)d)")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument(  # forwarded to seq_to_seq_net, which never reads it
+    "--max_length",
+    type=int,
+    default=250,
+    help="The maximum length of sequence when doing generation. "
+    "(default: %(default)d)")
+parser.add_argument(
+    '--with_test',
+    action='store_true',
+    help='If set, test the testset during training.')
+
+
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+    """Run one LSTM step and return the new (hidden_t, cell_t) pair."""
+    def gate(activation):
+        # Every call builds a separate fc layer over [hidden_t_prev, x_t].
+        projected = fluid.layers.fc(
+            input=[hidden_t_prev, x_t], size=size, bias_attr=True)
+        return activation(x=projected)
+
+    forget_gate = gate(fluid.layers.sigmoid)
+    input_gate = gate(fluid.layers.sigmoid)
+    output_gate = gate(fluid.layers.sigmoid)
+    cell_tilde = gate(fluid.layers.tanh)
+
+    kept = fluid.layers.elementwise_mul(x=forget_gate, y=cell_t_prev)
+    fresh = fluid.layers.elementwise_mul(x=input_gate, y=cell_tilde)
+    cell_t = fluid.layers.sums(input=[kept, fresh])
+
+    squashed = fluid.layers.tanh(x=cell_t)
+    return fluid.layers.elementwise_mul(x=output_gate, y=squashed), cell_t
+
+
+def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
+                   target_dict_dim, is_generating, beam_size, max_length):
+    """Build the seq2seq training net; returns (avg_cost, feeding_list)."""
+
+    def bi_lstm_encoder(input_seq, gate_size):
+        # The linear projections for the input, output and forget gates and
+        # the cell activation must be computed outside of dynamic_lstm, so
+        # each fc layer below outputs 4 * gate_size values.
+        input_forward_proj = fluid.layers.fc(input=input_seq,
+                                             size=gate_size * 4,
+                                             act=None,
+                                             bias_attr=False)
+        forward, _ = fluid.layers.dynamic_lstm(
+            input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
+        input_reversed_proj = fluid.layers.fc(input=input_seq,
+                                              size=gate_size * 4,
+                                              act=None,
+                                              bias_attr=False)
+        reversed, _ = fluid.layers.dynamic_lstm(
+            input=input_reversed_proj,
+            size=gate_size * 4,
+            is_reverse=True,
+            use_peepholes=False)
+        return forward, reversed  # note: `reversed` shadows the builtin here
+
+    src_word_idx = fluid.layers.data(
+        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+
+    src_embedding = fluid.layers.embedding(
+        input=src_word_idx,
+        size=[source_dict_dim, embedding_dim],
+        dtype='float32')
+
+    src_forward, src_reversed = bi_lstm_encoder(
+        input_seq=src_embedding, gate_size=encoder_size)
+
+    encoded_vector = fluid.layers.concat(
+        input=[src_forward, src_reversed], axis=1)
+
+    encoded_proj = fluid.layers.fc(input=encoded_vector,
+                                   size=decoder_size,
+                                   bias_attr=False)
+
+    backward_first = fluid.layers.sequence_pool(
+        input=src_reversed, pool_type='first')  # feeds decoder_boot below
+
+    decoder_boot = fluid.layers.fc(input=backward_first,
+                                   size=decoder_size,
+                                   bias_attr=False,
+                                   act='tanh')
+
+    def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
+                                    decoder_boot, decoder_size):
+        def simple_attention(encoder_vec, encoder_proj, decoder_state):
+            decoder_state_proj = fluid.layers.fc(input=decoder_state,
+                                                 size=decoder_size,
+                                                 bias_attr=False)
+            decoder_state_expand = fluid.layers.sequence_expand(
+                x=decoder_state_proj, y=encoder_proj)
+            concated = fluid.layers.concat(
+                input=[encoder_proj, decoder_state_expand], axis=1)
+            attention_weights = fluid.layers.fc(input=concated,
+                                                size=1,
+                                                act='tanh',
+                                                bias_attr=False)
+            attention_weights = fluid.layers.sequence_softmax(
+                input=attention_weights)
+            weigths_reshape = fluid.layers.reshape(
+                x=attention_weights, shape=[-1])
+            scaled = fluid.layers.elementwise_mul(
+                x=encoder_vec, y=weigths_reshape, axis=0)
+            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
+            return context
+
+        rnn = fluid.layers.DynamicRNN()
+
+        cell_init = fluid.layers.fill_constant_batch_size_like(
+            input=decoder_boot,
+            value=0.0,
+            shape=[-1, decoder_size],
+            dtype='float32')
+        cell_init.stop_gradient = False
+
+        with rnn.block():
+            current_word = rnn.step_input(target_embedding)
+            encoder_vec = rnn.static_input(encoder_vec)
+            encoder_proj = rnn.static_input(encoder_proj)
+            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+            cell_mem = rnn.memory(init=cell_init)
+            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
+            decoder_inputs = fluid.layers.concat(
+                input=[context, current_word], axis=1)
+            h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
+            rnn.update_memory(hidden_mem, h)
+            rnn.update_memory(cell_mem, c)
+            out = fluid.layers.fc(input=h,
+                                  size=target_dict_dim,
+                                  bias_attr=True,
+                                  act='softmax')
+            rnn.output(out)
+        return rnn()
+
+    if not is_generating:  # no else: returns None; beam_size/max_length unused
+        trg_word_idx = fluid.layers.data(
+            name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+
+        trg_embedding = fluid.layers.embedding(
+            input=trg_word_idx,
+            size=[target_dict_dim, embedding_dim],
+            dtype='float32')
+
+        prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
+                                                 encoded_proj, decoder_boot,
+                                                 decoder_size)
+        label = fluid.layers.data(
+            name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
+
+        return avg_cost, feeding_list
+
+
+def to_lodtensor(data, place):
+    """Pack a batch of sequences into one LoDTensor with a single LoD level.
+
+    Returns (tensor, total_len); total_len is the summed sequence length.
+    """
+    offsets = [0]
+    for seq in data:
+        offsets.append(offsets[-1] + len(seq))
+    flat = np.concatenate(data, axis=0).astype("int64")
+    tensor = core.LoDTensor()
+    tensor.set(flat.reshape([len(flat), 1]), place)
+    tensor.set_lod([offsets])
+    return tensor, offsets[-1]
+
+
+def lodtensor_to_ndarray(lod_tensor):
+    """Copy a LoDTensor into a float32 ndarray, one element at a time."""
+    shape = lod_tensor.get_dims()
+    count = np.product(shape)
+    values = [lod_tensor.get_float_element(k) for k in xrange(count)]
+    return np.array(values, dtype='float32').reshape(shape)
+
+
+def train():
+    """Train the seq2seq model on WMT14 and report throughput."""
+    avg_cost, feeding_list = seq_to_seq_net(
+        args.embedding_dim,
+        args.encoder_size,
+        args.decoder_size,
+        args.dict_size,
+        args.dict_size,
+        False,
+        beam_size=args.beam_size,
+        max_length=args.max_length)
+
+    # Snapshot the program before optimizer.minimize(); used for validation.
+    inference_program = fluid.default_main_program().clone()
+
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimizer.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    train_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    test_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    def do_validation():
+        total_loss = 0.0
+        count = 0
+        for batch_id, data in enumerate(test_batch_generator()):
+            src_seq = to_lodtensor([x[0] for x in data], place)[0]
+            trg_seq = to_lodtensor([x[1] for x in data], place)[0]
+            lbl_seq = to_lodtensor([x[2] for x in data], place)[0]
+
+            fetch_outs = exe.run(inference_program,
+                                 feed={
+                                     feeding_list[0]: src_seq,
+                                     feeding_list[1]: trg_seq,
+                                     feeding_list[2]: lbl_seq
+                                 },
+                                 fetch_list=[avg_cost],
+                                 return_numpy=False)
+
+            total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
+            count += 1
+
+        return total_loss / count
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in xrange(args.pass_num):
+        for batch_id, data in enumerate(train_batch_generator()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            src_seq, word_num = to_lodtensor([x[0] for x in data], place)
+            num_samples += word_num
+            trg_seq, word_num = to_lodtensor([x[1] for x in data], place)
+            num_samples += word_num
+            lbl_seq, _ = to_lodtensor([x[2] for x in data], place)
+
+            fetch_outs = exe.run(framework.default_main_program(),
+                                 feed={
+                                     feeding_list[0]: src_seq,
+                                     feeding_list[1]: trg_seq,
+                                     feeding_list[2]: lbl_seq
+                                 },
+                                 fetch_list=[avg_cost])
+
+            iters += 1
+            loss = np.array(fetch_outs[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
+            )  # avg_cost fetched for the mini-batch that just ran.
+
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # Optional evaluation on the WMT14 test reader.
+        if args.with_test:
+            test_loss = do_validation()
+            print("Test loss: %f" % test_loss)
+        exit(0)  # NOTE: ends the process after the first pass (see --pass_num)
+
+
+def infer():
+    """Placeholder: does nothing (inference is not implemented)."""
+
+
+def print_arguments(args):  # Dump each parsed option as "name: value".
+    print('----------- seq2seq Configuration Arguments -----------')
+    for name, val in sorted(vars(args).items()):
+        print('%s: %s' % (name, val))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+    # `args` becomes a module-level global; train() reads it directly.
+    args = parser.parse_args()
+    print_arguments(args)
+    # --infer_only selects infer(); otherwise run training.
+    run = infer if args.infer_only else train
+    run()
diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc10ac2ec195acc9a5693718141ddb32417dfb71
--- /dev/null
+++ b/benchmark/fluid/mnist.py
@@ -0,0 +1,224 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+
+SEED = 1
+DTYPE = "float32"
+
+# The random seed must be set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+
+
+def parse_args():
+    """Parse the mnist benchmark command-line options and return the namespace."""
+    cli = argparse.ArgumentParser("mnist model benchmark.")
+    cli.add_argument(
+        '--batch_size', type=int, default=128, help='The minibatch size.')
+    cli.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    cli.add_argument(
+        '--iterations', type=int, default=35, help='The number of minibatches.')
+    cli.add_argument(
+        '--pass_num', type=int, default=5, help='The number of passes.')
+    cli.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    cli.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    cli.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    cli.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    cli.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
+    return cli.parse_args()
+
+
+def cnn_model(data):
+    """Two conv+pool stages followed by a softmax fc layer; returns the prediction."""
+    stage_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    stage_2 = fluid.nets.simple_img_conv_pool(
+        input=stage_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+
+    # TODO(dzhwinter) : refine the initializer and random seed setting
+    SIZE = 10
+    flat_dim = reduce(lambda a, b: a * b, stage_2.shape[1:], 1)
+    scale = (2.0 / (flat_dim**2 * SIZE))**0.5
+
+    predict = fluid.layers.fc(
+        input=stage_2,
+        size=SIZE,
+        act="softmax",
+        param_attr=fluid.param_attr.ParamAttr(
+            initializer=fluid.initializer.NormalInitializer(
+                loc=0.0, scale=scale)))
+    return predict
+
+
+def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
+    """Run inference_program over the MNIST test set; return the weighted accuracy."""
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)  # module-level args
+    test_pass_acc = fluid.average.WeightedAverage()
+    for batch_id, data in enumerate(test_reader()):
+        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
+                                data)).astype(DTYPE)
+        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+        y_data = y_data.reshape([len(y_data), 1])
+        acc, weight = exe.run(inference_program,
+                              feed={"pixel": img_data,
+                                    "label": y_data},
+                              fetch_list=[batch_acc, batch_size_tensor])
+        test_pass_acc.add(value=acc, weight=weight)
+    # Evaluate once, after every batch has been accumulated, not once per batch.
+    return test_pass_acc.eval()
+
+
+def run_benchmark(model, args):
+    """Train `model` on MNIST, print per-iteration stats and the measured throughput."""
+    if args.use_cprof:
+        import cProfile  # local import: this file has no module-level cProfile import
+        pr = cProfile.Profile()
+        pr.enable()  # NOTE(review): the profile is never disabled or reported here
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    predict = model(images)
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+
+    # Optimization
+    opt = fluid.optimizer.AdamOptimizer(
+        learning_rate=0.001, beta1=0.9, beta2=0.999)
+    opt.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    # Initialize executor
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+
+    # Parameter initialization
+    exe.run(fluid.default_startup_program())
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+
+    accuracy = fluid.metrics.Accuracy()
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        accuracy.reset()
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            img_data = np.array(
+                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([len(y_data), 1])
+
+            outs = exe.run(
+                fluid.default_main_program(),
+                feed={"pixel": img_data,
+                      "label": y_data},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
+            )  # The accuracy is the accumulation of batches, but not the current batch.
+            accuracy.update(value=outs[1], weight=outs[2])
+            iters += 1
+            num_samples += len(y_data)
+            loss = np.array(outs[0])
+            acc = np.array(outs[1])
+            train_losses.append(loss)
+            train_accs.append(acc)
+            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
+                  (pass_id, iters, loss, acc))
+
+        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+                                     inference_program)
+        exit(0)
+
+
+def print_arguments(args):
+    # nvprof only applies to GPU runs, so the flag is forced off for any other device.
+    args.use_nvprof = args.use_nvprof and args.device == 'GPU'
+    print('----------- mnist Configuration Arguments -----------')
+    for name, setting in sorted(vars(args).iteritems()):
+        print('%s: %s' % (name, setting))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':  # entry point: parse and echo options, then run the benchmark
+    args = parse_args()
+    print_arguments(args)
+    if not (args.use_nvprof and args.device == 'GPU'):
+        run_benchmark(cnn_model, args)
+    else:
+        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+            run_benchmark(cnn_model, args)
diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..1af5eaf6b46be47cb6b778cedcf53830c201ef39
--- /dev/null
+++ b/benchmark/fluid/resnet.py
@@ -0,0 +1,313 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import functools
+import numpy as np
+import time
+
+import cProfile, pstats, StringIO
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+
+
+def parse_args():
+    """Parse the resnet benchmark command-line options and return the namespace."""
+    cli = argparse.ArgumentParser('Convolution model benchmark.')
+    cli.add_argument(
+        '--model',
+        type=str,
+        choices=['resnet_imagenet', 'resnet_cifar10'],
+        default='resnet_imagenet',
+        help='The model architecture.')
+    cli.add_argument(
+        '--batch_size', type=int, default=32, help='The minibatch size.')
+    cli.add_argument(
+        '--use_fake_data',
+        action='store_true',
+        help='use real data or fake data')
+    cli.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    cli.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
+    cli.add_argument(
+        '--pass_num', type=int, default=100, help='The number of passes.')
+    cli.add_argument(
+        '--data_format',
+        type=str,
+        default='NCHW',
+        choices=['NCHW', 'NHWC'],
+        help='The data data_format, now only support NCHW.')
+    cli.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    cli.add_argument(
+        '--data_set',
+        type=str,
+        default='flowers',
+        choices=['cifar10', 'flowers'],
+        help='Optional dataset for benchmark.')
+    cli.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    cli.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    cli.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    cli.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
+    return cli.parse_args()
+
+
+def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+    """Apply a bias-free conv2d without activation, then batch_norm with `act`."""
+    conv_out = fluid.layers.conv2d(
+        act=None,
+        bias_attr=False,
+        input=input,
+        num_filters=ch_out,
+        filter_size=filter_size,
+        stride=stride, padding=padding)
+    return fluid.layers.batch_norm(act=act, input=conv_out)
+
+
+def shortcut(input, ch_out, stride):
+    """Return `input` when its channels equal ch_out, else a 1x1 conv_bn_layer of it."""
+    channel_axis = 1 if args.data_format == 'NCHW' else -1
+    if input.shape[channel_axis] == ch_out:
+        return input
+    return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+
+
+def basicblock(input, ch_out, stride):  # two 3x3 conv+bn layers added to the shortcut, then relu
+    residual = shortcut(input, ch_out, stride)
+    body = conv_bn_layer(input, ch_out, 3, stride, 1)
+    body = conv_bn_layer(body, ch_out, 3, 1, 1, act=None)
+    return fluid.layers.elementwise_add(x=residual, y=body, act='relu')
+
+
+def bottleneck(input, ch_out, stride):  # 1x1 -> 3x3 -> 1x1 conv+bn; output has ch_out * 4 channels
+    residual = shortcut(input, ch_out * 4, stride)
+    body = conv_bn_layer(input, ch_out, 1, stride, 0)
+    body = conv_bn_layer(body, ch_out, 3, 1, 1)
+    body = conv_bn_layer(body, ch_out * 4, 1, 1, 0, act=None)
+    return fluid.layers.elementwise_add(x=residual, y=body, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):  # first block strides, the rest use 1
+    stacked = block_func(input, ch_out, stride)
+    for _ in range(count - 1):
+        stacked = block_func(stacked, ch_out, 1)
+    return stacked
+
+
+def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+    """Build the ImageNet-style ResNet for `depth`; returns the softmax fc output."""
+    cfg = {
+        18: ([2, 2, 2, 2], basicblock),  # two blocks in every stage, as in the ResNet paper
+        34: ([3, 4, 6, 3], basicblock),
+        50: ([3, 4, 6, 3], bottleneck),
+        101: ([3, 4, 23, 3], bottleneck),
+        152: ([3, 8, 36, 3], bottleneck)
+    }
+    stages, block_func = cfg[depth]
+    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+    pool1 = fluid.layers.pool2d(
+        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+    pool2 = fluid.layers.pool2d(
+        input=res4,
+        pool_size=7,
+        pool_type='avg',
+        pool_stride=1,
+        global_pooling=True)
+    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
+    return out
+
+
+def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
+    """Build the CIFAR-10 ResNet with (depth - 2) // 6 basic blocks per stage."""
+    assert (depth - 2) % 6 == 0
+
+    blocks_per_stage = (depth - 2) // 6
+    stem = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    stage1 = layer_warp(basicblock, stem, 16, blocks_per_stage, 1)
+    stage2 = layer_warp(basicblock, stage1, 32, blocks_per_stage, 2)
+    stage3 = layer_warp(basicblock, stage2, 64, blocks_per_stage, 2)
+    pooled = fluid.layers.pool2d(
+        input=stage3, pool_size=8, pool_type='avg', pool_stride=1)
+    prediction = fluid.layers.fc(input=pooled, size=class_dim, act='softmax')
+    return prediction
+
+
+def run_benchmark(model, args):  # build `model`, train it and print loss/accuracy/throughput
+    if args.use_cprof:
+        pr = cProfile.Profile()
+        pr.enable()  # NOTE(review): the profile is never disabled or reported here
+    # Class count and per-sample input shape depend on the dataset and data layout.
+    if args.data_set == "cifar10":
+        class_dim = 10
+        if args.data_format == 'NCHW':
+            dshape = [3, 32, 32]
+        else:
+            dshape = [32, 32, 3]
+    else:
+        class_dim = 102
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+    # Network inputs, prediction and mean cross-entropy loss.
+    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    predict = model(input, class_dim)
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+    # Clone before the optimizer is added, then derive the inference program from the clone.
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+    opts = optimizer.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+        batch_size=args.batch_size)
+    # Runs inference_program over test_reader and returns the weighted average accuracy.
+    def test(exe):
+        test_accuracy = fluid.average.WeightedAverage()
+        for batch_id, data in enumerate(test_reader()):
+            img_data = np.array(map(lambda x: x[0].reshape(dshape),
+                                    data)).astype("float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+
+            acc, weight = exe.run(inference_program,
+                                  feed={"data": img_data,
+                                        "label": y_data},
+                                  fetch_list=[batch_acc, batch_size_tensor])
+            test_accuracy.add(value=acc, weight=weight)
+
+        return test_accuracy.eval()
+
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    accuracy = fluid.average.WeightedAverage()
+    if args.use_fake_data:  # convert one real batch up front and feed it on every iteration
+        data = train_reader().next()
+        image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
+            'float32')
+        label = np.array(map(lambda x: x[1], data)).astype('int64')
+        label = label.reshape([-1, 1])
+    # Timing and the sample counter restart once skip_batch_num iterations have run.
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        accuracy.reset()
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            if not args.use_fake_data:
+                image = np.array(map(lambda x: x[0].reshape(dshape),
+                                     data)).astype('float32')
+                label = np.array(map(lambda x: x[1], data)).astype('int64')
+                label = label.reshape([-1, 1])
+            loss, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={'data': image,
+                      'label': label},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+            iters += 1
+            num_samples += len(label)
+            accuracy.add(value=acc, weight=weight)
+            train_losses.append(loss)
+            train_accs.append(acc)
+            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
+                  (pass_id, iters, loss, acc))
+        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            pass_test_acc = test(exe)
+        exit(0)  # NOTE(review): exits inside the pass loop, so only the first pass ever runs
+
+
+def print_arguments(args):
+    # nvprof only applies to GPU runs, so the flag is forced off for any other device.
+    args.use_nvprof = args.use_nvprof and args.device == 'GPU'
+    print('----------- resnet Configuration Arguments -----------')
+    for name, setting in sorted(vars(args).iteritems()):
+        print('%s: %s' % (name, setting))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+    # Script entry point. model_map resolves the --model choice to the function
+    # that builds that network.
+    model_map = {'resnet_imagenet': resnet_imagenet,
+                 'resnet_cifar10': resnet_cifar10}
+    args = parse_args()
+    print_arguments(args)
+    if args.data_format == 'NHWC':
+        raise ValueError('Only support NCHW data_format now.')
+    if not (args.use_nvprof and args.device == 'GPU'):
+        run_benchmark(model_map[args.model], args)
+    else:
+        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+            run_benchmark(model_map[args.model], args)
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f6dfd20bf2ee0b668b6d4238d4511253b2233035
--- /dev/null
+++ b/benchmark/fluid/run.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# This script benchmarks PaddlePaddle Fluid on
+# single thread single GPU.
+
+#export FLAGS_fraction_of_gpu_memory_to_use=0.0
+export CUDNN_PATH=/paddle/cudnn_v5
+
+# disable openmp and mkl parallel
+#https://github.com/PaddlePaddle/Paddle/issues/7199
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+ht=$(lscpu |grep "per core"|awk -F':' '{print $2}'|xargs)
+if [ "${ht:-0}" -eq 1 ]; then # HT is OFF (an empty lscpu result falls through to the HT-on branch)
+    if [ -z "$KMP_AFFINITY" ]; then
+        export KMP_AFFINITY="granularity=fine,compact,0,0"
+    fi
+    if [ -z "$OMP_DYNAMIC" ]; then
+        export OMP_DYNAMIC="FALSE"
+    fi
+else # HT is ON
+    if [ -z "$KMP_AFFINITY" ]; then
+        export KMP_AFFINITY="granularity=fine,compact,1,0"
+    fi
+fi
+# disable multi-gpu if have more than one
+export CUDA_VISIBLE_DEVICES=0
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
+
+# only query the gpu used
+nohup stdbuf -oL nvidia-smi \
+      --id=${CUDA_VISIBLE_DEVICES} \
+      --query-gpu=timestamp \
+      --query-compute-apps=pid,process_name,used_memory \
+      --format=csv \
+      --filename=mem.log  \
+      -l 1 &
+# mnist
+# mnist gpu mnist 128
+FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+               --device=GPU \
+               --batch_size=128 \
+               --skip_batch_num=5 \
+               --iterations=500 \
+               2>&1 | tee -a mnist_gpu_128.log
+
+# vgg16 (the script added next to this one is fluid/vgg.py)
+# gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg.py \
+               --device=GPU \
+               --batch_size=128 \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a vgg16_gpu_128.log
+
+# vgg16 gpu flowers 32
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg.py \
+               --device=GPU \
+               --batch_size=32 \
+               --data_set=flowers \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a vgg16_gpu_flowers_32.log
+
+# resnet50 (the script added next to this one is fluid/resnet.py)
+# resnet50 gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet.py \
+               --device=GPU \
+               --batch_size=128 \
+               --data_set=cifar10 \
+               --model=resnet_cifar10 \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a resnet50_gpu_128.log
+
+# resnet50 gpu flowers 64
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet.py \
+               --device=GPU \
+               --batch_size=64 \
+               --data_set=flowers \
+               --model=resnet_imagenet \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a resnet50_gpu_flowers_64.log
+
+# lstm
+# lstm gpu imdb 32 # tensorflow only support batch=32
+FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+               --device=GPU \
+               --batch_size=32 \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               --hidden_dim=512 \
+               --emb_dim=512 \
+               --crop_size=1500 \
+               2>&1 | tee -a lstm_gpu_32.log
+
+# seq2seq
+# seq2seq gpu wmb 128
+FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+               --device=GPU \
+               --batch_size=128 \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a seq2seq_gpu_128.log
diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fcbdd64af9dc196c9d5b2b82ce4213478ea1418
--- /dev/null
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@@ -0,0 +1,236 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import cPickle
+import os
+import random
+import time
+
+import numpy
+import paddle.v2 as paddle
+import paddle.v2.dataset.imdb as imdb
+import paddle.fluid as fluid
+from paddle.v2 import batch
+import paddle.fluid.profiler as profiler
+
+
+def parse_args():
+    """Parse the stacked dynamic LSTM benchmark options and return the namespace."""
+    parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
+    parser.add_argument(
+        '--emb_dim',
+        type=int,
+        default=512,
+        help='Dimension of embedding table. (default: %(default)d)')
+    parser.add_argument(
+        '--hidden_dim',
+        type=int,
+        default=512,
+        help='Hidden size of lstm unit. (default: %(default)d)')
+    parser.add_argument(
+        '--pass_num',
+        type=int,
+        default=100,
+        help='Epoch number to train. (default: %(default)d)')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='CPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--crop_size',
+        type=int,
+        default=int(os.environ.get('CROP_SIZE', '1500')),
+        help='The max sentence length of input. Since this model uses plain RNN,'
+        ' gradients could explode if the sentence is too long')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
+    return parser.parse_args()
+
+
+word_dict = imdb.word_dict()
+
+
+def crop_sentence(reader, crop_size):
+    """Wrap `reader` so it only yields items with fewer than crop_size known words."""
+    unk_value = word_dict['<unk>']  # the imdb dictionary keys unknown words as '<unk>'
+    def __impl__():
+        for item in reader():
+            if len([x for x in item[0] if x != unk_value]) < crop_size:
+                yield item
+
+    return __impl__
+
+
+def main():  # build the DynamicRNN LSTM classifier, train it and print loss/accuracy/throughput
+    args = parse_args()
+    lstm_size = args.hidden_dim
+    # Word ids are embedded and projected to lstm_size before they enter the RNN.
+    data = fluid.layers.data(
+        name="words", shape=[1], lod_level=1, dtype='int64')
+    sentence = fluid.layers.embedding(
+        input=data, size=[len(word_dict), args.emb_dim])
+
+    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        word = rnn.step_input(sentence)
+        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+        # Gate pre-activation: fc(ipt) with bias summed with fc(hidden) without bias.
+        def gate_common(
+                ipt,
+                hidden,
+                size, ):
+            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+            gate = fluid.layers.sums(input=[gate0, gate1])
+            return gate
+        # Every gate below is built by its own gate_common() call.
+        forget_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        input_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        output_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        cell_gate = fluid.layers.tanh(
+            x=gate_common(word, prev_hidden, lstm_size))
+        # cell = forget_gate * prev_cell + input_gate * cell_gate
+        cell = fluid.layers.sums(input=[
+            fluid.layers.elementwise_mul(
+                x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
+                    x=input_gate, y=cell_gate)
+        ])
+        # hidden = output_gate * tanh(cell)
+        hidden = fluid.layers.elementwise_mul(
+            x=output_gate, y=fluid.layers.tanh(x=cell))
+
+        rnn.update_memory(prev_cell, cell)
+        rnn.update_memory(prev_hidden, hidden)
+        rnn.output(hidden)
+    # Classify from the 'last' sequence_pool of the RNN output.
+    last = fluid.layers.sequence_pool(rnn(), 'last')
+    logit = fluid.layers.fc(input=last, size=2, act='softmax')
+    loss = fluid.layers.cross_entropy(
+        input=logit,
+        label=fluid.layers.data(
+            name='label', shape=[1], dtype='int64'))
+    loss = fluid.layers.mean(x=loss)
+
+    # add acc
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
+                shape=[1], dtype='int64'), total=batch_size_tensor)
+    # Clone before the optimizer is added, then derive the inference program from the clone.
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    adam = fluid.optimizer.Adam()
+    adam.minimize(loss)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    train_reader = batch(
+        paddle.reader.shuffle(
+            crop_sentence(imdb.train(word_dict), args.crop_size),
+            buf_size=25000),
+        batch_size=args.batch_size)
+    # Timing and the sample counter restart once skip_batch_num iterations have run.
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            tensor_words = to_lodtensor([x[0] for x in data], place)
+            label = numpy.array([x[1] for x in data]).astype("int64")
+            label = label.reshape((-1, 1))
+            loss_np, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"words": tensor_words,
+                      "label": label},
+                fetch_list=[loss, batch_acc, batch_size_tensor])
+            iters += 1
+            for x in data:
+                num_samples += len(x[0])  # adds the word count of each sequence, not 1 per item
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iters, loss_np, acc)
+            )  # acc is the batch_acc value fetched by this run, not a running total.
+
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        exit(0)  # NOTE(review): exits inside the pass loop, so only the first pass ever runs
+
+
+def to_lodtensor(data, place):
+    """Flatten a list of sequences into one int64 LoDTensor placed on `place`."""
+    # offsets[i] is where sequence i starts in the flattened data; the final
+    # entry is the total length.
+    offsets = [0]
+    for seq in data:
+        offsets.append(offsets[-1] + len(seq))
+    flat = numpy.concatenate(data, axis=0).astype("int64")
+    flat = flat.reshape([len(flat), 1])
+    tensor = fluid.LoDTensor()
+    tensor.set(flat, place)
+    tensor.set_lod([offsets])
+    return tensor
+
+
+def print_arguments(args):  # echo every parsed option as a "name: value" line, sorted by name
+    print('----------- lstm Configuration Arguments -----------')
+    for name, setting in sorted(vars(args).iteritems()):
+        print('%s: %s' % (name, setting))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':  # script entry point
+    args = parse_args()
+    print_arguments(args)  # echo the configuration before any work starts
+    main()  # main() calls parse_args() again for its own local copy
diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d990eff62ec368dc7033f55cc0862fa974a64e0
--- /dev/null
+++ b/benchmark/fluid/vgg.py
@@ -0,0 +1,224 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import argparse
+import functools
+
+parser = argparse.ArgumentParser(description=__doc__)  # description is the module docstring
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The first num of minibatch num to skip, for better performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument(
+    '--data_format',
+    type=str,
+    default='NCHW',
+    choices=['NCHW', 'NHWC'],
+    help='The data order, now only support NCHW.')
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+parser.add_argument(
+    '--with_test',
+    action='store_true',
+    help='If set, test the testset during training.')
+args = parser.parse_args()  # parsed at import time; main() and print_arguments() read this global
+
+
+def vgg16_bn_drop(input):
+    """VGG16 conv stack with batch norm and dropout, ending in two fc layers."""
+    def conv_block(ipt, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=ipt,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+    # (num_filter, groups, dropout rates) of the five conv blocks, in order.
+    block_cfgs = [(64, 2, [0.3, 0]), (128, 2, [0.4, 0]),
+                  (256, 3, [0.4, 0.4, 0]), (512, 3, [0.4, 0.4, 0]),
+                  (512, 3, [0.4, 0.4, 0])]
+    feat = input
+    for num_filter, groups, dropouts in block_cfgs:
+        feat = conv_block(feat, num_filter, groups, dropouts)
+    feat = fluid.layers.dropout(x=feat, dropout_prob=0.5)
+    feat = fluid.layers.fc(input=feat, size=512, act=None)
+    feat = fluid.layers.batch_norm(input=feat, act='relu')
+    feat = fluid.layers.dropout(x=feat, dropout_prob=0.5)
+    return fluid.layers.fc(input=feat, size=512, act=None)
+
+
+def main():
+    """Build the VGG16 classifier, run one training pass and print throughput.
+
+    Reads its settings from the module-level `args` and ends by calling exit(0).
+    """
+    if args.data_set == "cifar10":
+        classdim, side = 10, 32
+    else:
+        classdim, side = 102, 224
+    if args.data_format == 'NCHW':
+        data_shape = [3, side, side]
+    else:
+        data_shape = [side, side, 3]
+
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    net = vgg16_bn_drop(images)
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    # Optimization
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    opts = optimizer.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    # Initialize executor
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    exe = fluid.Executor(place)
+
+    # Parameter initialization
+    exe.run(fluid.default_startup_program())
+
+    # data reader
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+        batch_size=args.batch_size)
+
+    def to_feed(data):
+        """Turn a minibatch of (image, label) samples into the feed dict."""
+        # Comprehensions, not map(): np.array(map(...)) breaks on Python 3.
+        img_data = np.array([x[0].reshape(data_shape)
+                             for x in data]).astype("float32")
+        y_data = np.array([x[1] for x in data]).astype("int64")
+        return {"pixel": img_data, "label": y_data.reshape([-1, 1])}
+
+    # test
+    def test(exe):
+        test_accuracy = fluid.average.WeightedAverage()
+        for batch_id, data in enumerate(test_reader()):
+            acc, weight = exe.run(inference_program,
+                                  feed=to_feed(data),
+                                  fetch_list=[batch_acc, batch_size_tensor])
+            test_accuracy.add(value=acc, weight=weight)
+        return test_accuracy.eval()
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    accuracy = fluid.average.WeightedAverage()
+    for pass_id in range(args.pass_num):
+        accuracy.reset()
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            feed = to_feed(data)
+            loss, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed=feed,
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+            accuracy.add(value=acc, weight=weight)
+            # Collect every minibatch so the pass means are not just the last one.
+            train_losses.append(loss)
+            train_accs.append(acc)
+            iters += 1
+            num_samples += len(feed["label"])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iters, loss, acc)
+            )  # `acc` is the accuracy of the current minibatch only.
+
+        # pass_train_acc = accuracy.eval()
+        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            pass_test_acc = test(exe)
+        # NOTE(review): exiting here ends the run after the first pass, so
+        # args.pass_num > 1 has no effect -- confirm this is intended.
+        exit(0)
+
+
+def print_arguments():  # Print each option of the global `args` as 'name: value'.
+    print('----------- vgg Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).items()):  # items(): Python 2 and 3
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == "__main__":  # run the benchmark only when executed as a script
+    print_arguments()
+    main()
diff --git a/benchmark/tensorflow/machine_translation.py b/benchmark/tensorflow/machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f77dce98353af53803246be8dc61063836b7867
--- /dev/null
+++ b/benchmark/tensorflow/machine_translation.py
@@ -0,0 +1,626 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.python.framework import dtypes
+from tensorflow.python.layers.core import Dense
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops.rnn_cell_impl import RNNCell, BasicLSTMCell
+from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple
+from tensorflow.contrib.rnn.python.ops import core_rnn_cell
+from tensorflow.python.ops import array_ops
+from tensorflow.python.util import nest
+import tensorflow.contrib.seq2seq as seq2seq
+from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
+import numpy as np
+import os
+import argparse
+import time
+
+import paddle.v2 as paddle
+
+parser = argparse.ArgumentParser(description=__doc__)  # command-line options of this benchmark
+parser.add_argument(
+    "--embedding_dim",
+    type=int,
+    default=512,
+    help="The dimension of embedding table. (default: %(default)d)")
+parser.add_argument(
+    "--encoder_size",
+    type=int,
+    default=512,
+    help="The size of encoder bi-rnn unit. (default: %(default)d)")
+parser.add_argument(
+    "--decoder_size",
+    type=int,
+    default=512,
+    help="The size of decoder rnn unit. (default: %(default)d)")
+parser.add_argument(
+    "--batch_size",
+    type=int,
+    default=128,
+    help="The sequence number of a mini-batch data. (default: %(default)d)")
+parser.add_argument(
+    "--dict_size",
+    type=int,
+    default=30000,
+    help="The dictionary capacity. Dictionaries of source sequence and "
+    "target dictionary have same capacity. (default: %(default)d)")
+parser.add_argument(
+    "--max_time_steps",
+    type=int,
+    default=81,
+    help="Max number of time steps for sequence. (default: %(default)d)")
+parser.add_argument(
+    "--pass_num",
+    type=int,
+    default=10,
+    help="The pass number to train. (default: %(default)d)")
+parser.add_argument(
+    "--learning_rate",
+    type=float,
+    default=0.0002,
+    help="Learning rate used to train the model. (default: %(default)f)")
+parser.add_argument(
+    "--infer_only", action='store_true', help="If set, run forward only.")
+parser.add_argument(
+    "--beam_size",
+    type=int,
+    default=3,
+    help="The width for beam searching. (default: %(default)d)")
+parser.add_argument(
+    "--max_generation_length",
+    type=int,
+    default=250,
+    help="The maximum length of sequence when doing generation. "
+    "(default: %(default)d)")
+parser.add_argument(
+    "--save_freq",
+    type=int,
+    default=500,
+    help="Save model checkpoint every this iteration. (default: %(default)d)")
+parser.add_argument(
+    "--model_dir",
+    type=str,
+    default='./checkpoint',
+    help="Path to save model checkpoints. (default: %(default)s)")  # %s: the default is a string
+
+_Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
+
+START_TOKEN_IDX = 0  # token id the beam-search decoder starts every sequence with
+END_TOKEN_IDX = 1  # token id that ends decoding; also used as the padding value
+
+
+class LSTMCellWithSimpleAttention(RNNCell):
+    """Add attention mechanism to BasicLSTMCell.
+    This class is a wrapper based on tensorflow's `BasicLSTMCell`.
+    """
+
+    def __init__(self,
+                 num_units,
+                 encoder_vector,
+                 encoder_proj,
+                 source_sequence_length,
+                 forget_bias=1.0,
+                 state_is_tuple=True,
+                 activation=None,
+                 reuse=None):
+        super(LSTMCellWithSimpleAttention, self).__init__(_reuse=reuse)
+        if not state_is_tuple:
+            tf.logging.warn(
+                "%s: Using a concatenated state is slower and will "
+                "soon be deprecated. Use state_is_tuple=True.", self)
+        self._num_units = num_units
+        # set padding part to 0
+        self._encoder_vector = self._reset_padding(encoder_vector,
+                                                   source_sequence_length)
+        self._encoder_proj = self._reset_padding(encoder_proj,
+                                                 source_sequence_length)
+        self._forget_bias = forget_bias
+        self._state_is_tuple = state_is_tuple
+        self._activation = activation or math_ops.tanh
+        self._linear = None
+
+    @property
+    def state_size(self):
+        return (LSTMStateTuple(self._num_units, self._num_units) \
+                if self._state_is_tuple else 2 * self._num_units)
+
+    @property
+    def output_size(self):
+        return self._num_units
+
+    def zero_state(self, batch_size, dtype):
+        """Return zero state tensors, reusing the last result for equal args."""
+        state_size = self.state_size
+        if hasattr(self, "_last_zero_state"):
+            (last_state_size, last_batch_size, last_dtype,
+             last_output) = getattr(self, "_last_zero_state")
+            if (last_batch_size == batch_size and last_dtype == dtype and
+                    last_state_size == state_size):
+                return last_output
+        with ops.name_scope(
+                type(self).__name__ + "ZeroState", values=[batch_size]):
+            output = rnn_cell_impl._zero_state_tensors(state_size, batch_size,
+                                                       dtype)
+        self._last_zero_state = (state_size, batch_size, dtype, output)
+        return output
+
+    def call(self, inputs, state):
+        """Run one LSTM step with attention; returns (new_h, new_state)."""
+        sigmoid = math_ops.sigmoid
+        # Parameters of gates are concatenated into one multiply for efficiency.
+        if self._state_is_tuple:
+            c, h = state
+        else:
+            c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
+        # get context from encoder outputs
+        context = self._simple_attention(self._encoder_vector,
+                                         self._encoder_proj, h)
+
+        if self._linear is None:
+            self._linear = _Linear([inputs, context, h], 4 * self._num_units,
+                                   True)
+        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+        i, j, f, o = array_ops.split(
+            value=self._linear([inputs, context, h]),
+            num_or_size_splits=4,
+            axis=1)
+        new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
+                 self._activation(j))
+        new_h = self._activation(new_c) * sigmoid(o)
+
+        if self._state_is_tuple:
+            new_state = LSTMStateTuple(new_c, new_h)
+        else:
+            new_state = array_ops.concat([new_c, new_h], 1)
+        return new_h, new_state
+
+    def _simple_attention(self, encoder_vec, encoder_proj, decoder_state):
+        """Implement the attention function.
+        The implementation has the same logic to the fluid decoder.
+        """
+        decoder_state_proj = tf.contrib.layers.fully_connected(
+            inputs=decoder_state,
+            num_outputs=self._num_units,
+            activation_fn=None,
+            biases_initializer=None)
+        decoder_state_expand = tf.tile(
+            tf.expand_dims(
+                input=decoder_state_proj, axis=1),
+            [1, tf.shape(encoder_proj)[1], 1])
+        concated = tf.concat([decoder_state_expand, encoder_proj], axis=2)
+        # need reduce the first dimension
+        attention_weights = tf.contrib.layers.fully_connected(
+            inputs=tf.reshape(
+                concated, shape=[-1, self._num_units * 2]),
+            num_outputs=1,
+            activation_fn=tf.nn.tanh,
+            biases_initializer=None)
+        attention_weights_reshaped = tf.reshape(
+            attention_weights, shape=[tf.shape(encoder_vec)[0], -1, 1])
+        # normalize the attention weights using softmax
+        attention_weights_normed = tf.nn.softmax(
+            attention_weights_reshaped, dim=1)
+        scaled = tf.multiply(attention_weights_normed, encoder_vec)
+        context = tf.reduce_sum(scaled, axis=1)
+        return context
+
+    def _reset_padding(self,
+                       memory,
+                       memory_sequence_length,
+                       check_inner_dims_defined=True):
+        """Reset the padding part for encoder inputs.
+        This funtion comes from tensorflow's `_prepare_memory` function.
+        """
+        memory = nest.map_structure(
+                lambda m: ops.convert_to_tensor(m, name="memory"), memory)
+        if memory_sequence_length is not None:
+            memory_sequence_length = ops.convert_to_tensor(
+                memory_sequence_length, name="memory_sequence_length")
+        if check_inner_dims_defined:
+            def _check_dims(m):
+                if not m.get_shape()[2:].is_fully_defined():
+                    raise ValueError(
+                        "Expected memory %s to have fully defined inner dims, "
+                        "but saw shape: %s" % (m.name, m.get_shape()))
+            nest.map_structure(_check_dims, memory)
+        if memory_sequence_length is None:
+            seq_len_mask = None
+        else:
+            seq_len_mask = array_ops.sequence_mask(
+                memory_sequence_length,
+                maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
+                dtype=nest.flatten(memory)[0].dtype)
+            seq_len_batch_size = (memory_sequence_length.shape[0].value or
+                                  array_ops.shape(memory_sequence_length)[0])
+
+        def _maybe_mask(m, seq_len_mask):
+            rank = m.get_shape().ndims
+            rank = rank if rank is not None else array_ops.rank(m)
+            extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
+            m_batch_size = m.shape[0].value or array_ops.shape(m)[0]
+            if memory_sequence_length is not None:
+                message = ("memory_sequence_length and memory tensor "
+                           "batch sizes do not match.")
+                with ops.control_dependencies([
+                        check_ops.assert_equal(
+                            seq_len_batch_size, m_batch_size, message=message)
+                ]):
+                    seq_len_mask = array_ops.reshape(
+                        seq_len_mask,
+                        array_ops.concat(
+                            (array_ops.shape(seq_len_mask), extra_ones), 0))
+                return m * seq_len_mask
+            else:
+                return m
+
+        return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask),
+                                  memory)
+
+
+def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
+                   target_dict_dim, is_generating, beam_size,
+                   max_generation_length):
+    """Build the attention seq2seq graph; returns (feed placeholders, output)."""
+    src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
+    src_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
+
+    src_embedding_weights = tf.get_variable("source_word_embeddings",
+                                            [source_dict_dim, embedding_dim])
+    src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)
+
+    src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
+    src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
+    # no peephole
+    encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
+        cell_fw=src_forward_cell,
+        cell_bw=src_reversed_cell,
+        inputs=src_embedding,
+        sequence_length=src_sequence_length,
+        dtype=tf.float32)
+    # concat the forward outputs and backward outputs
+    encoded_vec = tf.concat(encoder_outputs, axis=2)
+
+    # project the encoder outputs to size of decoder lstm
+    encoded_proj = tf.contrib.layers.fully_connected(
+        inputs=tf.reshape(
+            encoded_vec, shape=[-1, encoder_size * 2]),  # fw + bw cell outputs
+        num_outputs=decoder_size,
+        activation_fn=None,
+        biases_initializer=None)
+    encoded_proj_reshape = tf.reshape(
+        encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])
+
+    # get init state for decoder lstm's H
+    backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
+    decoder_boot = tf.contrib.layers.fully_connected(
+        inputs=tf.reshape(
+            backword_first, shape=[-1, encoder_size]),  # width of one encoder cell
+        num_outputs=decoder_size,
+        activation_fn=tf.nn.tanh,
+        biases_initializer=None)
+
+    # prepare the initial state for decoder lstm
+    cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
+    initial_state = LSTMStateTuple(cell_init, decoder_boot)
+
+    # create decoder lstm cell
+    decoder_cell = LSTMCellWithSimpleAttention(
+        decoder_size,
+        encoded_vec
+        if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
+        encoded_proj_reshape if not is_generating else
+        seq2seq.tile_batch(encoded_proj_reshape, beam_size),
+        src_sequence_length if not is_generating else
+        seq2seq.tile_batch(src_sequence_length, beam_size),
+        forget_bias=0.0)
+
+    output_layer = Dense(target_dict_dim, name='output_projection')
+
+    if not is_generating:
+        trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
+        trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
+        trg_embedding_weights = tf.get_variable(
+            "target_word_embeddings", [target_dict_dim, embedding_dim])
+        trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
+                                               trg_word_idx)
+
+        training_helper = seq2seq.TrainingHelper(
+            inputs=trg_embedding,
+            sequence_length=trg_sequence_length,
+            time_major=False,
+            name='training_helper')
+
+        training_decoder = seq2seq.BasicDecoder(
+            cell=decoder_cell,
+            helper=training_helper,
+            initial_state=initial_state,
+            output_layer=output_layer)
+
+        # get the max length of target sequence
+        max_decoder_length = tf.reduce_max(trg_sequence_length)
+
+        decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
+            decoder=training_decoder,
+            output_time_major=False,
+            impute_finished=True,
+            maximum_iterations=max_decoder_length)
+
+        decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
+        decoder_pred_train = tf.argmax(
+            decoder_logits_train, axis=-1, name='decoder_pred_train')
+        masks = tf.sequence_mask(
+            lengths=trg_sequence_length,
+            maxlen=max_decoder_length,
+            dtype=tf.float32,
+            name='masks')
+
+        # place holder of label sequence
+        lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])
+
+        # compute the loss
+        loss = seq2seq.sequence_loss(
+            logits=decoder_logits_train,
+            targets=lbl_word_idx,
+            weights=masks,
+            average_across_timesteps=True,
+            average_across_batch=True)
+
+        # return feeding list and loss operator
+        return {
+            'src_word_idx': src_word_idx,
+            'src_sequence_length': src_sequence_length,
+            'trg_word_idx': trg_word_idx,
+            'trg_sequence_length': trg_sequence_length,
+            'lbl_word_idx': lbl_word_idx
+        }, loss
+    else:
+        start_tokens = tf.ones([tf.shape(src_word_idx)[0], ],
+                               tf.int32) * START_TOKEN_IDX
+        # share the same embedding weights with target word
+        trg_embedding_weights = tf.get_variable(
+            "target_word_embeddings", [target_dict_dim, embedding_dim])
+
+        inference_decoder = beam_search_decoder.BeamSearchDecoder(
+            cell=decoder_cell,
+            embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens),
+            start_tokens=start_tokens,
+            end_token=END_TOKEN_IDX,
+            initial_state=tf.nn.rnn_cell.LSTMStateTuple(
+                tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
+                tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
+            beam_width=beam_size,
+            output_layer=output_layer)
+
+        decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
+            decoder=inference_decoder,
+            output_time_major=False,
+            #impute_finished=True,# error occurs
+            maximum_iterations=max_generation_length)
+
+        predicted_ids = decoder_outputs_decode.predicted_ids
+
+        return {
+            'src_word_idx': src_word_idx,
+            'src_sequence_length': src_sequence_length
+        }, predicted_ids
+
+
+def print_arguments(args):  # Print each parsed argument as 'name: value'.
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in vars(args).items():  # items(): Python 2 and 3
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def padding_data(data, padding_size, value):
+    # Right-pad `data` with `value`, then cut the result to `padding_size` items.
+    return (data + [value] * padding_size)[:padding_size]
+
+
+def save(sess, path, var_list=None, global_step=None):  # checkpoint `var_list` at `path`
+    ckpt = tf.train.Saver(var_list).save(
+        sess, save_path=path, global_step=global_step)
+    print('Model save at %s' % ckpt)
+
+
+def restore(sess, path, var_list=None):
+    """Load the checkpoint at `path` into `sess` and report it."""
+    # A `var_list` of None makes the Saver cover all saveable variables.
+    tf.train.Saver(var_list).restore(sess, save_path=path)
+    print('model restored from %s' % path)
+
+
+def adapt_batch_data(data):
+    """Pad a batch of (source, target, label) id sequences into int32 arrays.
+
+    Returns a dict keyed like the feeding dict of `seq_to_seq_net`, holding the
+    padded arrays plus the original source and target sequence lengths.
+    """
+    # Lists, not map(): each sequence list is traversed twice below, which an
+    # exhausted Python 3 map iterator would not survive.
+    src_seq = [x[0] for x in data]
+    trg_seq = [x[1] for x in data]
+    lbl_seq = [x[2] for x in data]
+
+    src_sequence_length = np.array(
+        [len(seq) for seq in src_seq]).astype('int32')
+    src_seq_maxlen = np.max(src_sequence_length)
+
+    trg_sequence_length = np.array(
+        [len(seq) for seq in trg_seq]).astype('int32')
+    trg_seq_maxlen = np.max(trg_sequence_length)
+
+    def pad_all(seqs, maxlen):
+        # Pad (or cut) every sequence to `maxlen` items with END_TOKEN_IDX.
+        return np.array([padding_data(seq, maxlen, END_TOKEN_IDX)
+                         for seq in seqs]).astype('int32')
+
+    return {
+        'src_word_idx': pad_all(src_seq, src_seq_maxlen),
+        'src_sequence_length': src_sequence_length,
+        'trg_word_idx': pad_all(trg_seq, trg_seq_maxlen),
+        'trg_sequence_length': trg_sequence_length,
+        'lbl_word_idx': pad_all(lbl_seq, trg_seq_maxlen)
+    }
+
+
+def train():
+    """Train the seq2seq model on WMT14, printing loss and words/s per pass."""
+    feeding_dict, loss = seq_to_seq_net(
+        embedding_dim=args.embedding_dim,
+        encoder_size=args.encoder_size,
+        decoder_size=args.decoder_size,
+        source_dict_dim=args.dict_size,
+        target_dict_dim=args.dict_size,
+        is_generating=False,
+        beam_size=args.beam_size,
+        max_generation_length=args.max_generation_length)
+
+    global_step = tf.Variable(0, trainable=False, name='global_step')
+    trainable_params = tf.trainable_variables()
+    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+    gradients = tf.gradients(loss, trainable_params)
+    # NOTE(review): the clipped gradients are computed but the unclipped ones
+    # are what gets applied below -- confirm whether clipping should be active.
+    clip_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
+
+    updates = optimizer.apply_gradients(
+        zip(gradients, trainable_params), global_step=global_step)
+
+    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
+    train_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    test_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    def do_validataion():
+        total_loss = 0.0
+        count = 0
+        for batch_id, data in enumerate(test_batch_generator()):
+            adapted_batch_data = adapt_batch_data(data)
+            outputs = sess.run([loss],
+                               feed_dict={
+                                   item[1]: adapted_batch_data[item[0]]
+                                   for item in feeding_dict.items()
+                               })
+            total_loss += outputs[0]
+            count += 1
+        return total_loss / count
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_l)
+        sess.run(init_g)
+        for pass_id in range(args.pass_num):  # range(): Python 2 and 3
+            pass_start_time = time.time()
+            words_seen = 0
+            for batch_id, data in enumerate(train_batch_generator()):
+                adapted_batch_data = adapt_batch_data(data)
+                words_seen += np.sum(adapted_batch_data['src_sequence_length'])
+                words_seen += np.sum(adapted_batch_data['trg_sequence_length'])
+                outputs = sess.run([updates, loss],
+                                   feed_dict={
+                                       item[1]: adapted_batch_data[item[0]]
+                                       for item in feeding_dict.items()
+                                   })
+                print("pass_id=%d, batch_id=%d, train_loss: %f" %
+                      (pass_id, batch_id, outputs[1]))
+            pass_end_time = time.time()
+            test_loss = do_validataion()
+            time_consumed = pass_end_time - pass_start_time
+            words_per_sec = words_seen / time_consumed
+            print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
+                  (pass_id, test_loss, words_per_sec, time_consumed))
+
+
+def infer():
+    """Restore a checkpoint and print beam-search decodes for each batch."""
+    feeding_dict, predicted_ids = seq_to_seq_net(
+        embedding_dim=args.embedding_dim,
+        encoder_size=args.encoder_size,
+        decoder_size=args.decoder_size,
+        source_dict_dim=args.dict_size,
+        target_dict_dim=args.dict_size,
+        is_generating=True,
+        beam_size=args.beam_size,
+        max_generation_length=args.max_generation_length)
+
+    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
+    # NOTE(review): this reader draws from the training split despite its
+    # name -- confirm which split inference is meant to use.
+    test_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    with tf.Session(config=config) as sess:
+        # Honour --model_dir; its default yields the path used previously.
+        restore(sess, os.path.join(args.model_dir, 'tf_seq2seq-1500'))
+        for batch_id, data in enumerate(test_batch_generator()):
+            # A list, not map(): src_seq is traversed several times below.
+            src_seq = [x[0] for x in data]
+            source_language_seq = [
+                src_dict[item] for seq in src_seq for item in seq
+            ]
+            src_sequence_length = np.array(
+                [len(seq) for seq in src_seq]).astype('int32')
+            src_seq_maxlen = np.max(src_sequence_length)
+            src_seq = np.array([
+                padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
+                for seq in src_seq
+            ]).astype('int32')
+            outputs = sess.run([predicted_ids],
+                               feed_dict={
+                                   feeding_dict['src_word_idx']: src_seq,
+                                   feeding_dict['src_sequence_length']:
+                                   src_sequence_length
+                               })
+            print("\nDecoder result comparison: ")
+            source_language_seq = ' '.join(source_language_seq).strip()
+            inference_seq = ''
+            print(" --> source: " + source_language_seq)
+            for item in outputs[0][0]:
+                if item[0] == END_TOKEN_IDX: break
+                inference_seq += ' ' + trg_dict.get(item[0], '')
+            print(" --> inference: " + inference_seq)
+
+
+if __name__ == '__main__':  # run only when executed as a script
+    args = parser.parse_args()  # module-level global read by train() and infer()
+    print_arguments(args)
+    if args.infer_only:  # --infer_only selects beam-search decoding over training
+        infer()
+    else:
+        train()
diff --git a/benchmark/tensorflow/mnist.py b/benchmark/tensorflow/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..7140eed6eaff49b5c65f9ccb2e38f113a4cdbdbf
--- /dev/null
+++ b/benchmark/tensorflow/mnist.py
@@ -0,0 +1,180 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import time
+import numpy as np
+
+import tensorflow as tf
+import paddle.v2 as paddle
+
+DTYPE = tf.float32
+
+
+def parse_args():
+    """Parse the MNIST benchmark's command-line flags.
+
+    Returns the argparse.Namespace (batch_size, iterations, pass_num, device).
+    """
+    cli = argparse.ArgumentParser("mnist model benchmark.")
+    cli.add_argument(
+        '--batch_size', type=int, default=128, help='The minibatch size.')
+    cli.add_argument(
+        '--iterations', type=int, default=35, help='The number of minibatches.')
+    cli.add_argument(
+        '--pass_num', type=int, default=5, help='The number of passes.')
+    cli.add_argument(
+        '--device', type=str, default='GPU', choices=['CPU', 'GPU'],
+        help='The device type.')
+    return cli.parse_args()
+
+
+def run_benchmark(args):
+    """Train a two-conv-layer MNIST classifier and print per-batch timings.
+
+    Runs on args.device for args.pass_num full passes in batches of
+    args.batch_size (args.iterations is not consulted), evaluating the test
+    set after every pass. Elapsed times are printed in seconds.
+    """
+    def weight_variable(dtype, shape):
+        initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
+        return tf.Variable(initial)
+
+    def bias_variable(dtype, shape):
+        initial = tf.constant(0.1, shape=shape, dtype=dtype)
+        return tf.Variable(initial)
+
+    device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
+    with tf.device(device):
+        images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
+        labels = tf.placeholder(tf.int64, shape=(None, ))
+
+        # conv1, relu, pool1
+        conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
+        conv1_bias = bias_variable(DTYPE, [20])
+        conv1 = tf.nn.conv2d(
+            images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
+        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
+        pool1 = tf.nn.max_pool(
+            relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
+
+        # conv2, relu, pool2
+        conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
+        conv2_bias = bias_variable(DTYPE, [50])
+        conv2 = tf.nn.conv2d(
+            pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
+        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
+        pool2 = tf.nn.max_pool(
+            relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
+
+        # FC: flatten all non-batch dimensions and project to the 10 classes
+        pool_shape = pool2.get_shape().as_list()
+        hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
+        reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
+        fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
+        fc_bias = bias_variable(DTYPE, [10])
+        logits = tf.matmul(reshape, fc_weights) + fc_bias
+
+        # Get prediction
+        prediction = tf.nn.softmax(logits)
+
+        # Loss: per-example cross entropy, averaged over the batch
+        one_hot_labels = tf.one_hot(labels, depth=10)
+        cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
+        avg_cost = tf.reduce_mean(cost)
+
+        # Get accuracy
+        correct = tf.equal(tf.argmax(prediction, 1), labels)
+        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+        # Streaming accuracy plus an op that resets its local counters
+        with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
+            g_accuracy = tf.metrics.accuracy(
+                labels, tf.argmax(prediction, axis=1))
+            metric_vars = tf.contrib.framework.get_variables(
+                scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
+            g_accuracy_reset_op = tf.variables_initializer(metric_vars)
+
+        # Optimizer
+        opt = tf.train.AdamOptimizer(
+            learning_rate=0.001, beta1=0.9, beta2=0.999)
+        train_op = opt.minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)
+
+    def eval_test():  # streaming accuracy over the whole test set
+        sess.run(g_accuracy_reset_op)
+        for batch_id, data in enumerate(test_reader()):
+            images_data = np.array(
+                map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
+            labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
+
+            loss, acc, g_acc = sess.run(
+                [avg_cost, accuracy, g_accuracy],
+                feed_dict={images: images_data,
+                           labels: labels_data})
+        return g_acc[1]  # value returned by the metric's update op
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    with tf.Session(config=config) as sess:
+        sess.run(tf.global_variables_initializer())
+        sess.run(tf.local_variables_initializer())
+        for pass_id in range(args.pass_num):
+            sess.run(g_accuracy_reset_op)
+            pass_start = time.time()
+            for batch_id, data in enumerate(train_reader()):
+                images_data = np.array(
+                    map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
+                labels_data = np.array(map(lambda x: x[1], data)).astype(
+                    "int64")
+
+                start = time.time()
+                _, loss, acc, g_acc = sess.run(
+                    [train_op, avg_cost, accuracy, g_accuracy],
+                    feed_dict={images: images_data,
+                               labels: labels_data})
+                end = time.time()
+
+                print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
+                      (pass_id, batch_id, loss, 1 - acc, end - start))
+
+            pass_end = time.time()
+            test_avg_acc = eval_test()
+
+            print(
+                "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
+                % (pass_id, g_acc[1], test_avg_acc, pass_end - pass_start))
+
+
+def print_arguments(args):  # print all parsed arguments between two banners
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):  # iteritems: Python 2
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':  # script entry point
+    args = parse_args()
+    print_arguments(args)  # echo the configuration before running
+    run_benchmark(args)
diff --git a/benchmark/tensorflow/resnet.py b/benchmark/tensorflow/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..c432fa8d59571e128b9ff9e3ffa1949b792ef3a4
--- /dev/null
+++ b/benchmark/tensorflow/resnet.py
@@ -0,0 +1,504 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py
+
+Get help: python resnet.py --help
+See performance on flowers: python resnet.py
+Train on cifar10: python resnet.py --data=cifar10 --with_test
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import time
+import numpy as np
+
+import paddle.v2 as paddle
+import tensorflow as tf
+
+DTYPE = tf.float32
+
+
+def parse_args():
+    """Parse the command-line flags of the ResNet benchmark.
+
+    Returns the parsed argparse.Namespace.
+    """
+    parser = argparse.ArgumentParser('Convolution model benchmark.')
+    parser.add_argument(
+        '--model', type=str, choices=['resnet'], default='resnet',
+        help='The model architecture.')
+    parser.add_argument(
+        '--batch_size', type=int, default=32, help='The minibatch size.')
+    parser.add_argument(
+        '--use_fake_data',
+        action='store_true',
+        help='use real data or fake data')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations',
+        type=int,
+        default=105,
+        help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=300, help='The number of passes.')
+    parser.add_argument(
+        '--order',
+        type=str,
+        default='NHWC',
+        choices=['NCHW', 'NHWC'],
+        help='The data order. Only NHWC is supported in CPU mode.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--data',
+        type=str,
+        default='flowers102',
+        choices=['flowers102', 'cifar10'],
+        help='The kinds of data.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    return parser.parse_args()
+
+
+def print_arguments(args):
+    """Normalize use_nvprof/iterations on args in place, then print all args."""
+    args.use_nvprof = args.use_nvprof and args.device == 'GPU'
+    if args.with_test:
+        args.iterations = args.pass_num * 1000
+    print('-----------  Configuration Arguments -----------')
+    for name, val in sorted(vars(args).iteritems()):
+        print('%s: %s' % (name, val))
+    print('------------------------------------------------')
+
+
+def fixed_padding(inputs, kernel_size, data_format):
+    """Zero-pad the two spatial dimensions by an amount fixed by kernel_size.
+
+    Args:
+      inputs: A tensor of size [batch, channels, height_in, width_in] or
+        [batch, height_in, width_in, channels] depending on data_format.
+      kernel_size: The kernel to be used in the conv2d or max_pool2d
+        operation. Should be a positive integer.
+      data_format: The input format ('channels_last' or 'channels_first').
+    Returns:
+      A tensor with the same format as the input with the data either intact
+      (if kernel_size == 1) or padded (if kernel_size > 1).
+    """
+    total = kernel_size - 1
+    before = total // 2
+    after = total - before
+    spatial = [[before, after], [before, after]]
+    no_pad = [[0, 0]]
+    if data_format == 'channels_first':
+        paddings = no_pad + no_pad + spatial
+    else:
+        paddings = no_pad + spatial + no_pad
+    return tf.pad(inputs, paddings)
+
+
+def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
+    """Strided 2-D convolution (no bias) with explicit padding."""
+    # Padding depends only on `kernel_size`, never on the dimensions of
+    # `inputs` (unlike `tf.layers.conv2d` alone), which is consistent with
+    # PaddlePaddle. For TensorFlow's output-size calculation see:
+    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc
+    if strides > 1:
+        inputs = fixed_padding(inputs, kernel_size, data_format)
+
+    padding_mode = 'VALID' if strides != 1 else 'SAME'
+    conv_kwargs = dict(
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding_mode,
+        use_bias=False,
+        kernel_initializer=tf.variance_scaling_initializer(),
+        data_format=data_format)
+    return tf.layers.conv2d(inputs=inputs, **conv_kwargs)
+
+
+def conv_bn(inputs,
+            filters,
+            kernel_size,
+            strides,
+            is_training,
+            data_format,
+            act=True):
+    """Convolution, then fused batch normalization, then optional ReLU.
+
+    The convolution goes through conv2d_fixed_padding; set act=False to
+    return the batch-normalized output without applying ReLU.
+    """
+    conv_out = conv2d_fixed_padding(
+        inputs=inputs, filters=filters, kernel_size=kernel_size,
+        strides=strides, data_format=data_format)
+    # fused=True gives a significant performance boost. See
+    # https://www.tensorflow.org/performance/performance_guide#common_fused_ops
+    channel_axis = 1 if data_format == 'channels_first' else 3
+    normed = tf.layers.batch_normalization(
+        inputs=conv_out,
+        axis=channel_axis,
+        momentum=0.9,
+        epsilon=1e-05,
+        center=True,
+        scale=True,
+        training=is_training,
+        fused=True)
+    return tf.nn.relu(normed) if act else normed
+
+
+def basicblock(inputs, filters, is_training, projection_shortcut, strides,
+               data_format):
+    """Residual block of two 3x3 conv_bn layers plus a shortcut, then ReLU."""
+    if projection_shortcut is None:
+        residual = inputs
+    else:
+        residual = projection_shortcut(inputs)
+    out = conv_bn(inputs, filters, 3, strides, is_training, data_format)
+    out = conv_bn(out, filters, 3, 1, is_training, data_format, act=False)
+    return tf.nn.relu(out + residual)
+
+
+def bottleneck(inputs, filters, is_training, projection_shortcut, strides,
+               data_format):
+    """Bottleneck residual block: 1x1, 3x3, then 1x1 (4 * filters) conv_bn."""
+    shortcut = inputs
+    if projection_shortcut is not None:
+        shortcut = projection_shortcut(inputs)
+    inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format)
+    # ReLU after the 3x3 conv too, as in the reference ResNet bottleneck.
+    inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format)
+    inputs = conv_bn(
+        inputs, filters * 4, 1, 1, is_training, data_format, act=False)
+    return tf.nn.relu(inputs + shortcut)
+
+
+def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name,
+                data_format):
+    """Stack `blocks` residual blocks and name the output tensor `name`."""
+    # Bottleneck blocks end with 4x the number of filters as they start with
+    if block_fn is bottleneck:
+        filters_out = 4 * filters
+    else:
+        filters_out = filters
+
+    def projection_shortcut(x):
+        return conv2d_fixed_padding(
+            inputs=x, filters=filters_out, kernel_size=1, strides=strides,
+            data_format=data_format)
+
+    # Only the first block per block_layer uses projection_shortcut and strides
+    inputs = block_fn(inputs, filters, is_training, projection_shortcut,
+                      strides, data_format)
+    for _ in range(blocks - 1):
+        inputs = block_fn(inputs, filters, is_training, None, 1, data_format)
+
+    return tf.identity(inputs, name)
+
+
+def resnet_imagenet(depth, class_dim, data_format):
+    """Returns the ResNet model for a given size and number of output classes.
+
+    Args:
+      depth: network depth; one of 18, 34, 50, 101, 152 or 200.
+      class_dim: number of units of the final dense layer.
+      data_format: 'channels_first' or 'channels_last'; None picks
+        'channels_first' when TensorFlow is built with CUDA.
+
+    Returns:
+      A function model(inputs, is_training) that builds the network on NHWC
+      inputs and returns the output of the final dense layer.
+
+    Raises:
+      ValueError: if depth is not one of the supported values.
+    """
+
+    def resnet_generator(block_fn, layers, num_classes,
+                         data_format='channels_last'):
+        """Returns model(inputs, is_training) for a block type and layout."""
+        if data_format is None:
+            data_format = ('channels_first'
+                           if tf.test.is_built_with_cuda() else 'channels_last')
+        # Width of the flattened feature vector fed to the final dense layer.
+        feature_dim = 512 if block_fn is basicblock else 2048
+
+        def model(inputs, is_training):
+            """Constructs the ResNet model given the inputs."""
+            if data_format == 'channels_first':
+                # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
+                # This provides a large performance boost on GPU. See
+                # https://www.tensorflow.org/performance/performance_guide#data_formats
+                inputs = tf.transpose(inputs, [0, 3, 1, 2])
+
+            # Stem: 7x7 stride-2 conv_bn followed by 3x3 stride-2 max pooling.
+            inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format)
+            inputs = tf.identity(inputs, 'initial_conv')
+            inputs = tf.layers.max_pooling2d(
+                inputs=inputs,
+                pool_size=3,
+                strides=2,
+                padding='SAME',
+                data_format=data_format)
+            inputs = tf.identity(inputs, 'initial_max_pool')
+            # Four residual stages; all but the first are built with stride 2.
+            inputs = block_layer(inputs, 64, block_fn, layers[0], 1,
+                                 is_training, 'block_layer1', data_format)
+            inputs = block_layer(inputs, 128, block_fn, layers[1], 2,
+                                 is_training, 'block_layer2', data_format)
+            inputs = block_layer(inputs, 256, block_fn, layers[2], 2,
+                                 is_training, 'block_layer3', data_format)
+            inputs = block_layer(inputs, 512, block_fn, layers[3], 2,
+                                 is_training, 'block_layer4', data_format)
+            # Head: 7x7 average pooling, flatten, then the dense classifier.
+            inputs = tf.layers.average_pooling2d(
+                inputs=inputs,
+                pool_size=7,
+                strides=1,
+                padding='VALID',
+                data_format=data_format)
+            inputs = tf.identity(inputs, 'final_avg_pool')
+            inputs = tf.reshape(inputs, [-1, feature_dim])
+            inputs = tf.layers.dense(inputs=inputs, units=num_classes)
+            inputs = tf.identity(inputs, 'final_dense')
+            return inputs
+
+        return model
+
+    # depth -> residual block function and number of blocks in each stage
+    model_params = {
+        18: {'block': basicblock, 'layers': [2, 2, 2, 2]},
+        34: {'block': basicblock, 'layers': [3, 4, 6, 3]},
+        50: {'block': bottleneck, 'layers': [3, 4, 6, 3]},
+        101: {'block': bottleneck, 'layers': [3, 4, 23, 3]},
+        152: {'block': bottleneck, 'layers': [3, 8, 36, 3]},
+        200: {'block': bottleneck, 'layers': [3, 24, 36, 3]},
+    }
+    if depth not in model_params:
+        raise ValueError('Not a valid depth:', depth)
+    params = model_params[depth]
+    return resnet_generator(params['block'], params['layers'], class_dim,
+                            data_format)
+
+
+def resnet_cifar10(depth, num_classes, data_format):
+    """Returns model(inputs, is_training) for a CIFAR-10 ResNet of depth 6n + 2."""
+    if depth % 6 != 2:
+        raise ValueError('depth must be 6n + 2:', depth)
+    num_blocks = (depth - 2) // 6
+
+    if data_format is None:
+        data_format = ('channels_first'
+                       if tf.test.is_built_with_cuda() else 'channels_last')
+
+    def model(inputs, is_training):
+        if data_format == 'channels_first':
+            # The caller feeds NHWC batches; convert to NCHW as resnet_imagenet does.
+            inputs = tf.transpose(inputs, [0, 3, 1, 2])
+        inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format)
+        inputs = tf.identity(inputs, 'initial_conv')
+        inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training,
+                             'block_layer1', data_format)
+        inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training,
+                             'block_layer2', data_format)
+        inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training,
+                             'block_layer3', data_format)
+        inputs = tf.layers.average_pooling2d(
+            inputs=inputs, pool_size=8, strides=1, padding='VALID',
+            data_format=data_format)
+        inputs = tf.identity(inputs, 'final_avg_pool')
+        inputs = tf.reshape(inputs, [-1, 64])
+        inputs = tf.layers.dense(inputs=inputs, units=num_classes)
+        inputs = tf.identity(inputs, 'final_dense')
+        return inputs
+
+    return model
+
+
+def run_benchmark(args, data_format='channels_last', device='/cpu:0'):
+    """Build the ResNet graph for args.data, train it and print throughput."""
+
+    class_dim = 1000  # defaults; both supported datasets override them below
+    dshape = (None, 224, 224, 3)
+
+    pdshape = (3, 224, 224)  # CHW shape each sample's x[0] is reshaped to
+    if args.data == 'flowers102':
+        class_dim = 102
+        dshape = (None, 224, 224, 3)
+        pdshape = (3, 224, 224)
+    elif args.data == 'cifar10':
+        class_dim = 10
+        dshape = (None, 32, 32, 3)
+        pdshape = (3, 32, 32)
+
+    with tf.device(device):
+        images = tf.placeholder(DTYPE, shape=dshape)
+        labels = tf.placeholder(tf.int64, shape=(None, ))
+        is_training = tf.placeholder('bool')  # fed True in training, False in test()
+        onehot_labels = tf.one_hot(labels, depth=class_dim)
+
+        network = resnet_cifar10(
+            32, class_dim,
+            data_format) if args.data == 'cifar10' else resnet_imagenet(
+                50, class_dim, data_format)
+
+        logits = network(inputs=images, is_training=is_training)
+
+        cross_entropy = tf.losses.softmax_cross_entropy(
+            logits=logits, onehot_labels=onehot_labels)
+        avg_cost = tf.reduce_mean(cross_entropy)
+
+        correct = tf.equal(tf.argmax(logits, 1), labels)
+        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+        lr = 0.1 if args.data == 'cifar10' else 0.01
+        optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
+
+        # Batch norm requires update_ops to be added as a train_op dependency.
+        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+        with tf.control_dependencies(update_ops):
+            train_op = optimizer.minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data == 'cifar10' else paddle.dataset.flowers.test(),
+        batch_size=100)
+
+    def test():  # reads pass_id, num_samples, train_elapsed from the enclosing scope
+        test_accs = []
+        for batch_id, data in enumerate(test_reader()):
+            test_images = np.array(
+                map(lambda x: np.transpose(x[0].reshape(pdshape),
+                axes=[1, 2, 0]), data)).astype("float32")
+            test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
+            test_accs.append(
+                accuracy.eval(feed_dict={
+                    images: test_images,
+                    labels: test_labels,
+                    is_training: False
+                }))
+        print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" %
+              (pass_id, num_samples / train_elapsed, np.mean(test_accs)))
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_g)
+        sess.run(init_l)
+
+        if args.use_fake_data:
+            data = train_reader().next()  # one batch, converted once and fed every step
+            images_data = np.array(
+                    map(lambda x: np.transpose(x[0].reshape(pdshape),
+                    axes=[1, 2, 0]), data)).astype("float32")
+            labels_data = np.array(map(lambda x: x[1], data)).astype('int64')
+        iters, num_samples, start_time = 0, 0, 0.0
+        for pass_id in range(args.pass_num):
+            if iters == args.iterations:
+                break
+            train_accs = []
+            train_losses = []
+            for batch_id, data in enumerate(train_reader()):
+                if iters == args.skip_batch_num:  # warm-up done: restart the clock
+                    start_time = time.time()
+                    num_samples = 0
+                if iters == args.iterations:
+                    break
+                if not args.use_fake_data:
+                    images_data = np.array(
+                        map(lambda x: np.transpose(x[0].reshape(pdshape),
+                        axes=[1, 2, 0]), data)).astype("float32")
+                    labels_data = np.array(map(lambda x: x[1], data)).astype(
+                        'int64')
+                _, loss, acc = sess.run([train_op, avg_cost, accuracy],
+                                        feed_dict={
+                                            images: images_data,
+                                            labels: labels_data,
+                                            is_training: True
+                                        })
+                iters += 1
+                train_accs.append(acc)
+                train_losses.append(loss)
+                num_samples += len(data)
+                print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" %
+                      (pass_id, iters, loss, acc))
+
+            train_elapsed = time.time() - start_time  # measured from the end of warm-up
+            print("Pass=%d, Loss=%f, Accuray=%f\n" %
+                  (pass_id, np.mean(train_losses), np.mean(train_accs)))
+
+            # evaluation
+            if args.with_test:
+                test()
+
+        if not args.with_test:
+            duration = time.time() - start_time
+            examples_per_sec = num_samples / duration
+            # NOTE(review): the next line divides by zero if iters == args.skip_batch_num
+            sec_per_batch = duration / (iters - args.skip_batch_num)
+
+            print('Total examples: %d, total time: %.5f' %
+                  (num_samples, duration))
+            print('%.5f examples/sec, %.5f sec/batch' %
+                  (examples_per_sec, sec_per_batch))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+
+    # Honor --device: use the GPU only when requested and TensorFlow has CUDA.
+    use_gpu = args.device == 'GPU' and tf.test.is_built_with_cuda()
+    device = '/device:GPU:0' if use_gpu else '/cpu:0'
+
+    # Translate --order into the data_format name used by tf.layers.
+    if args.order == 'NHWC':
+        data_format = 'channels_last'
+    elif use_gpu:
+        data_format = 'channels_first'
+    else:
+        raise ValueError('Only support NHWC order in CPU mode')
+
+    run_benchmark(args, data_format, device)
diff --git a/benchmark/tensorflow/stacked_dynamic_lstm.py b/benchmark/tensorflow/stacked_dynamic_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5285033005044d907d0b7e91eb66ee7281c4f27a
--- /dev/null
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
@@ -0,0 +1,220 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import tensorflow as tf
+
+import paddle.v2 as paddle
+
+
+def parse_args():
+    """Parse the command-line flags of the stacked LSTM benchmark.
+
+    Returns:
+      argparse.Namespace with batch_size, stacked_num, embedding_dim,
+      hidden_dim, pass_num, learning_rate and infer_only.
+    """
+    cli = argparse.ArgumentParser("LSTM model benchmark.")
+    cli.add_argument(
+        '--batch_size',
+        type=int, default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    cli.add_argument(
+        '--stacked_num',
+        type=int, default=5,
+        help='Number of lstm layers to stack. (default: %(default)d)')
+    cli.add_argument(
+        '--embedding_dim',
+        type=int, default=512,
+        help='Dimension of embedding table. (default: %(default)d)')
+    cli.add_argument(
+        '--hidden_dim',
+        type=int, default=512,
+        help='Hidden size of lstm unit. (default: %(default)d)')
+    cli.add_argument(
+        '--pass_num',
+        type=int, default=10,
+        help='Epoch number to train. (default: %(default)d)')
+    cli.add_argument(
+        '--learning_rate',
+        type=float, default=0.0002,
+        help='Learning rate used to train. (default: %(default)f)')
+    cli.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+
+    return cli.parse_args()
+
+
+def print_arguments(args):  # print all parsed arguments between two banners
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):  # iteritems: Python 2
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def dynamic_lstm_model(dict_size, embedding_dim, hidden_dim, stacked_num,
+                       class_num=2, is_train=True):
+    """Build a stacked-LSTM sequence classifier graph.
+
+    Returns ((word_idx, sequence_length), softmax) when is_train is False,
+    else ((word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op).
+    """
+    word_idx = tf.placeholder(tf.int64, shape=[None, None])
+    sequence_length = tf.placeholder(tf.int64, shape=[None, ])
+
+    embedding_weights = tf.get_variable('word_embeddings',
+                                        [dict_size, embedding_dim])
+    embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
+
+    # One LSTMCell per layer: repeating a single cell object shares weights.
+    stacked_cell = tf.nn.rnn_cell.MultiRNNCell([
+        tf.nn.rnn_cell.LSTMCell(num_units=hidden_dim, use_peepholes=False)
+        for _ in range(stacked_num)
+    ])
+
+    # final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
+    _, final_state = tf.nn.dynamic_rnn(
+        cell=stacked_cell, inputs=embedding, dtype=tf.float32,
+        sequence_length=sequence_length)
+
+    w = tf.Variable(
+        tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
+    bias = tf.Variable(
+        tf.constant(value=0.0, shape=[class_num], dtype=tf.float32))
+    prediction = tf.matmul(final_state[-1][1], w) + bias
+
+    if not is_train:
+        return (word_idx, sequence_length), tf.nn.softmax(prediction)
+
+    label = tf.placeholder(tf.int64, shape=[None, ])
+    loss = tf.nn.softmax_cross_entropy_with_logits(
+        labels=tf.one_hot(label, class_num), logits=prediction)
+    avg_loss = tf.reduce_mean(loss)
+
+    correct_count = tf.equal(tf.argmax(prediction, 1), label)
+    acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
+
+    with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
+        g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
+        metric_vars = tf.contrib.framework.get_variables(
+            scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
+        reset_op = tf.variables_initializer(metric_vars)
+
+    return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
+
+
+def padding_data(data, padding_size, value):
+    """Return `data` padded with `value` or truncated to `padding_size` items."""
+    return (data + [value] * padding_size)[:padding_size]
+
+
+def train(args):
+    """Train the stacked LSTM on IMDB, validating on the test set each pass."""
+    word_dict = paddle.dataset.imdb.word_dict()
+    dict_size = len(word_dict)
+
+    feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
+        dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
+
+    adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+    train_op = adam_optimizer.minimize(avg_loss)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=args.batch_size)
+
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.test(word_dict), buf_size=25000),
+        batch_size=args.batch_size)
+
+    def do_validation(sess):
+        # Evaluation only: train_op is deliberately not fetched here, so the
+        # test set never updates the model parameters.
+        sess.run(reset_op)
+        for batch_id, data in enumerate(test_reader()):
+            word_idx = map(lambda x: x[0], data)
+            sequence_length = np.array(
+                [len(seq) for seq in word_idx]).astype('int64')
+            maxlen = np.max(sequence_length)
+            word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
+            word_idx = np.array(word_idx).astype('int64')
+            label = np.array(map(lambda x: x[1], data)).astype('int64')
+
+            loss, fetch_acc, fetch_g_acc = sess.run(
+                [avg_loss, acc, g_acc],
+                feed_dict={
+                    feeding_list[0]: word_idx,
+                    feeding_list[1]: sequence_length,
+                    feeding_list[2]: label
+                })
+
+        return fetch_g_acc[1]
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+    with tf.Session(config=config) as sess:
+        sess.run(tf.local_variables_initializer())
+        sess.run(tf.global_variables_initializer())
+
+        for pass_id in xrange(args.pass_num):
+            # clear accuracy local variable
+            sess.run(reset_op)
+            pass_start_time = time.time()
+            words_seen = 0
+
+            for batch_id, data in enumerate(train_reader()):
+                word_idx = map(lambda x: x[0], data)
+                sequence_length = np.array(
+                    [len(seq) for seq in word_idx]).astype('int64')
+                words_seen += np.sum(sequence_length)
+                maxlen = np.max(sequence_length)
+                word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
+                word_idx = np.array(word_idx).astype('int64')
+                label = np.array(map(lambda x: x[1], data)).astype('int64')
+
+                _, loss, fetch_acc, fetch_g_acc = sess.run(
+                    [train_op, avg_loss, acc, g_acc],
+                    feed_dict={
+                        feeding_list[0]: word_idx,
+                        feeding_list[1]: sequence_length,
+                        feeding_list[2]: label
+                    })
+
+                print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
+                      % (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
+
+            time_consumed = time.time() - pass_start_time
+            words_per_sec = words_seen / time_consumed
+            test_acc = do_validation(sess)
+            print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
+                  (pass_id, test_acc, words_per_sec, time_consumed))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+
+    # --infer_only is accepted but no inference path is implemented here, so
+    # the script does nothing in that mode.
+    if not args.infer_only:
+        train(args)
diff --git a/benchmark/tensorflow/vgg.py b/benchmark/tensorflow/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..fba5ec71a46b3ac8b2e1244424c39fd5192e5458
--- /dev/null
+++ b/benchmark/tensorflow/vgg.py
@@ -0,0 +1,324 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in TensorFlow"""
+import tensorflow as tf
+import paddle.v2 as paddle
+import numpy as np
+import argparse
+import time
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The first num of minibatch num to skip, for better performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument(
+    '--data_format',
+    type=str,
+    default='NHWC',
+    choices=['NCHW', 'NHWC'],
+    help='The data order, NCHW=[batch, channels, height, width]. '
+    'Only support NHWC right now.')
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+args = parser.parse_args()
+
+
+class VGG16Model(object):
+    """VGG16 network with batch normalization and dropout (see `network`)."""
+
+    def __init__(self):
+        self.parameters = []
+
+    def batch_norm_relu(self, inputs, is_training):
+        """Performs a batch normalization followed by a ReLU."""
+        # We set fused=True for a significant speed boost. See
+        # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
+        inputs = tf.layers.batch_normalization(
+            inputs=inputs,
+            axis=1 if args.data_format == 'NCHW' else -1,
+            momentum=0.9,
+            epsilon=1e-05,
+            center=True,
+            scale=True,
+            training=is_training,
+            fused=True)
+        inputs = tf.nn.relu(inputs)
+        return inputs
+
+    def conv_bn_layer(self, name, images, kernel_shape, is_training,
+                      drop_rate=0.0):
+        """Stride-1 SAME conv + bias, then batch norm/ReLU, then dropout.
+
+        The last entry of kernel_shape sizes the bias vector.
+        """
+        with tf.name_scope(name):
+            kernel = tf.Variable(
+                tf.truncated_normal(
+                    kernel_shape, dtype=tf.float32, stddev=1e-1),
+                name='weights')
+            conv = tf.nn.conv2d(
+                images,
+                kernel, [1, 1, 1, 1],
+                data_format=args.data_format,
+                padding='SAME')
+            biases = tf.Variable(
+                tf.constant(0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
+                trainable=True,
+                name='biases')
+            out = tf.nn.bias_add(conv, biases)
+            out = self.batch_norm_relu(out, is_training)
+            out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
+            return out
+
+    def fc_layer(self, name, inputs, shape):
+        """Return matmul(inputs, weights) + bias; weights have `shape`."""
+        with tf.name_scope(name):
+            fc_w = tf.Variable(
+                tf.truncated_normal(shape, dtype=tf.float32, stddev=1e-1),
+                name='weights')
+            fc_b = tf.Variable(
+                tf.constant(0.0, shape=[shape[-1]], dtype=tf.float32),
+                trainable=True,
+                name='biases')
+            out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
+            return out
+
+    def network(self, images, class_dim, is_training):
+        """Build the VGG16 graph on `images` and return the fc3 output.
+
+            TODO(kuke): enable this network to support the 'NCHW' data format
+        """
+
+        # conv1
+        conv1_1 = self.conv_bn_layer(
+            'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
+        conv1_2 = self.conv_bn_layer(
+            'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
+        # pool1
+        pool1 = tf.nn.max_pool(
+            conv1_2,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool1')
+        # conv2
+        conv2_1 = self.conv_bn_layer(
+            'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
+        conv2_2 = self.conv_bn_layer(
+            'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
+        # pool2
+        pool2 = tf.nn.max_pool(
+            conv2_2,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool2')
+        # conv3
+        conv3_1 = self.conv_bn_layer(
+            'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
+        conv3_2 = self.conv_bn_layer(
+            'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
+        conv3_3 = self.conv_bn_layer(
+            'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
+        # pool3
+        pool3 = tf.nn.max_pool(
+            conv3_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool3')
+        # conv4
+        conv4_1 = self.conv_bn_layer(
+            'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
+        conv4_2 = self.conv_bn_layer(
+            'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv4_3 = self.conv_bn_layer(
+            'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+        # pool4
+        pool4 = tf.nn.max_pool(
+            conv4_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool4')
+        # conv5
+        conv5_1 = self.conv_bn_layer(
+            'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv5_2 = self.conv_bn_layer(
+            'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv5_3 = self.conv_bn_layer(
+            'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+        # pool5
+        pool5 = tf.nn.max_pool(
+            conv5_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool5')
+        # flatten
+        shape = int(np.prod(pool5.get_shape()[1:]))
+        pool5_flat = tf.reshape(pool5, [-1, shape])
+        # fc1
+        drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
+        fc1 = self.fc_layer('fc1', drop, [shape, 512])
+        # fc2
+        bn = self.batch_norm_relu(fc1, is_training)
+        drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
+        fc2 = self.fc_layer('fc2', drop, [512, 512])
+
+        fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
+
+        return fc3
+
+
+def run_benchmark():
+    """Run benchmark on cifar10 or flowers."""
+
+    if args.data_set == "cifar10":
+        class_dim = 10
+        raw_shape = (3, 32, 32)
+        dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
+            None, 3, 32, 32)
+    else:
+        class_dim = 102
+        raw_shape = (3, 224, 224)
+        dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
+            None, 3, 224, 224)
+
+    device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
+
+    with tf.device(device):
+        images = tf.placeholder(tf.float32, shape=dat_shape)
+        labels = tf.placeholder(tf.int64, shape=(None, ))
+        is_training = tf.placeholder('bool')
+        onehot_labels = tf.one_hot(labels, depth=class_dim)
+
+        vgg16 = VGG16Model()
+        logits = vgg16.network(images, class_dim, is_training)
+        loss = tf.losses.softmax_cross_entropy(
+            onehot_labels=onehot_labels, logits=logits)
+        avg_loss = tf.reduce_mean(loss)
+
+        correct = tf.equal(tf.argmax(logits, 1), labels)
+        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+        with tf.control_dependencies(update_ops):
+            train_op = optimizer.minimize(avg_loss)
+
+    # data reader
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.test10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+
+    def to_batch(data):
+        """Return (images, labels) arrays for one reader batch."""
+        # Reshape in both layouts (NHWC already did) so feeds match dat_shape.
+        imgs = [x[0].reshape(raw_shape) for x in data]
+        if args.data_format == 'NHWC':
+            imgs = [np.transpose(img, axes=[1, 2, 0]) for img in imgs]
+        batch_labels = np.array([x[1] for x in data]).astype('int64')
+        return np.array(imgs).astype("float32"), batch_labels
+
+    def test():
+        """Return the mean of per-batch accuracy over the test reader."""
+        test_accs = []
+        for data in test_reader():
+            test_images, test_labels = to_batch(data)
+            test_accs.append(
+                accuracy.eval(feed_dict={
+                    images: test_images,
+                    labels: test_labels,
+                    is_training: False
+                }))
+        return np.mean(test_accs)
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    with tf.Session(config=config) as sess:
+        sess.run(tf.global_variables_initializer())
+        sess.run(tf.local_variables_initializer())
+        iters, num_samples, start_time = 0, 0, time.time()
+        for pass_id in range(args.num_passes):
+            # train
+            num_samples = 0
+            start_time = time.time()
+            for batch_id, data in enumerate(train_reader()):
+                if iters == args.skip_batch_num:
+                    start_time = time.time()
+                    num_samples = 0
+                if iters == args.iterations:
+                    break  # iters is never reset: later passes skip training
+                train_images, train_labels = to_batch(data)
+                _, loss, acc = sess.run([train_op, avg_loss, accuracy],
+                                        feed_dict={
+                                            images: train_images,
+                                            labels: train_labels,
+                                            is_training: True
+                                        })
+                iters += 1
+                num_samples += len(data)
+                print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
+                      (pass_id, iters, loss, acc))
+            train_elapsed = time.time() - start_time
+            # test
+            pass_test_acc = test()
+            print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
+                  (pass_id, num_samples / train_elapsed, pass_test_acc))
+
+
+def print_arguments():  # Print every parsed argument, sorted by name.
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).items()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+    print_arguments()
+    run_benchmark()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 0f76f55270592c5625a9624b33f4c0f82efdc627..f726405c4773994f6ca6509e5218750805b03995 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -57,11 +57,7 @@ if(NOT WITH_GOLANG)
     add_definitions(-DPADDLE_WITHOUT_GOLANG)
 endif(NOT WITH_GOLANG)
 
-if(NOT WITH_GPU)
-    add_definitions(-DHPPL_STUB_FUNC)
-
-    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
-else()
+if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
 
     FIND_PACKAGE(CUDA REQUIRED)
@@ -84,7 +80,14 @@ else()
     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
     include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif(NOT WITH_GPU)
+elseif(WITH_AMD_GPU)
+    add_definitions(-DPADDLE_WITH_HIP)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
+else()
+    add_definitions(-DHPPL_STUB_FUNC)
+    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
+endif()
 
 if (WITH_MKLML AND MKLML_IOMP_LIB)
     message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index d9cd264b49d546c35a2c57a82ead83ea654b60ae..10662fc96704685f030a5d76c6857d4bc20a63d9 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -24,7 +24,7 @@ set(BOOST_PROJECT       "extern_boost")
 # So we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
 set(BOOST_TAR           "boost_1_41_0")
-set(BOOST_URL           "http://paddlepaddledeps.s3-website-us-west-1.amazonaws.com/${BOOST_TAR}.tar.gz")
+set(BOOST_URL           "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 6a701e076c95372f903a09d35d4208ee73bd584c..73d70c34dce8bedd9e62519c207e5be3dcf7dba3 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -4,18 +4,33 @@ SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
 SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
 
-ExternalProject_Add(
-    extern_eigen3
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
-    GIT_TAG         70661066beef694cadf6c304d0d07e0758825c10
-    PREFIX          ${EIGEN_SOURCE_DIR}
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     ""
-    INSTALL_COMMAND   ""
-    TEST_COMMAND      ""
-)
+if(WITH_AMD_GPU)
+    ExternalProject_Add(
+        extern_eigen3
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/sabreshao/hipeigen.git"
+        GIT_TAG         0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+        PREFIX          ${EIGEN_SOURCE_DIR}
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND     ""
+        INSTALL_COMMAND   ""
+        TEST_COMMAND      ""
+    )
+else()
+    ExternalProject_Add(
+        extern_eigen3
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
+        GIT_TAG         70661066beef694cadf6c304d0d07e0758825c10
+        PREFIX          ${EIGEN_SOURCE_DIR}
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND     ""
+        INSTALL_COMMAND   ""
+        TEST_COMMAND      ""
+    )
+endif()
 
 if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
     set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index a25cff5fc567f22d4573625487f31bd4192bb172..5759e5c489724332793bf103b7aacf7ffb068611 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -36,7 +36,8 @@ MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
 
-INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
+INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h
 
 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
     SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 739a910c7c670b7b9f89e543582a32a80546fb11..796bcf28a1dfb308ccb7a2f839742c5c2fcf2002 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -28,13 +28,13 @@ INCLUDE(ExternalProject)
 
 SET(MKLML_PROJECT       "extern_mklml")
 SET(MKLML_VER           "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
+SET(MKLML_URL           "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
 SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
 SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
-SET(MKLML_ROOT          ${MKLML_INSTALL_DIR}/${MKLML_VER})
+SET(MKLML_ROOT          ${MKLML_INSTALL_DIR})
 SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
 SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
 SET(MKLML_LIB           ${MKLML_LIB_DIR}/libmklml_intel.so)
@@ -46,7 +46,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
 FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
   "PROJECT(MKLML)\n"
   "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${MKLML_VER}\n"
+  "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
   "        DESTINATION ${MKLML_DST_DIR})\n")
 
 ExternalProject_Add(
diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake
deleted file mode 100644
index af5c689c3524741a88518eeb3f85996872257677..0000000000000000000000000000000000000000
--- a/cmake/external/nccl.cmake
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-if(NOT WITH_GPU)
-  return()
-endif()
-
-include(ExternalProject)
-
-set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
-
-include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
-
-if(WITH_DSO)
-  # If we use DSO, we do not build nccl, just download the dependencies
-  set(NCCL_BUILD_COMMAND "")
-  set(NCCL_INSTALL_COMMAND "")
-  set(NCCL_INSTALL_DIR "")
-else()
-  # otherwise, we build nccl and link it.
-  set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
-  # Note: cuda 8.0 is needed to make nccl
-  # When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root
-  set(NCCL_BUILD_COMMAND "make -j 8")
-  set(NCCL_INSTALL_COMMAND  "make install PREFIX=${NCCL_INSTALL_DIR}")
-endif()
-
-ExternalProject_Add(
-    extern_nccl
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/NVIDIA/nccl.git"
-    GIT_TAG         "v1.3.4-1"
-    PREFIX          "${NCCL_SOURCE_DIR}"
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     "${NCCL_BUILD_COMMAND}"
-    INSTALL_COMMAND   "${NCCL_INSTALL_COMMAND}"
-    INSTALL_DIR       "${NCCL_INSTALL_DIR}"
-    TEST_COMMAND      ""
-)
-
-if(WITH_DSO)
-  if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
-    add_library(nccl STATIC ${dummyfile})
-  else()
-    add_library(nccl INTERFACE)
-  endif()
-else()
-  add_library(nccl STATIC IMPORTED GLOBAL)
-  set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
-               ${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
-endif()
-
-add_dependencies(nccl extern_nccl)
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 5377a0b046a796cd6f0bb1fb466e1cd0b4b678bf..8f7a3bf8eeaef75c8840f4ea318b484d33249bb7 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -54,5 +54,7 @@ add_library(snappystream STATIC IMPORTED GLOBAL)
 set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
         "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
 
-include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
+include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappystream to include its own headers.
+include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
+
 add_dependencies(snappystream extern_snappystream)
diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0159815fed81bdff6de3e561af569e9edc75f947
--- /dev/null
+++ b/cmake/external/threadpool.cmake
@@ -0,0 +1,30 @@
+INCLUDE(ExternalProject)
+
+SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
+SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
+INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})
+# Fetch only: every update/configure/build/install/test step below is empty.
+ExternalProject_Add(
+    extern_threadpool
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/progschj/ThreadPool.git"
+    GIT_TAG         9a42ec1329f259a5f4881a291db1dcb8f2ad9040
+    PREFIX          ${THREADPOOL_SOURCE_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+)
+# simple_threadpool is a stand-in target tied to the download; CMake older than 3.3.0 gets a dummy static library instead of an INTERFACE library.
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/threadpool_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy_threadpool = \"${dummyfile}\";")
+    add_library(simple_threadpool STATIC ${dummyfile})
+else()
+    add_library(simple_threadpool INTERFACE)
+endif()
+
+add_dependencies(simple_threadpool extern_threadpool)
+
+LIST(APPEND external_project_dependencies simple_threadpool)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 9a9a20f897e09b823dfb19ff841c3f2aeb3f9fe6..a631ad14b18310598f7eea3a51839d61a9e456ff 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -62,7 +62,8 @@ ExternalProject_Add(
 )
 
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.
 
 ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 20b8506e678af4db6ccb65bef99d28e085a67bf2..c3d73235453c8c9fd2859c3ab142888e8bda2dbe 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -25,7 +25,8 @@ ELSE(WIN32)
   SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
 ENDIF(WIN32)
 
-INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
 
 ExternalProject_Add(
     extern_zlib
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 471e3929069d0d28105404b4f0f6baa303faf0e0..c4c9f77df8d57fe162616d2250bd4dfe5b7754e7 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -244,14 +244,14 @@ function(cc_test TARGET_NAME)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
     # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
       list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
     endif()
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
-             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
 endfunction(cc_test)
 
@@ -311,12 +311,88 @@ function(nv_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_test(${TARGET_NAME} ${TARGET_NAME})
   endif()
 endfunction(nv_test)
 
+function(hip_library TARGET_NAME) # Build a HIP library from SRCS, or merge DEPS when no SRCS are given; no-op unless WITH_AMD_GPU.
+  if (WITH_AMD_GPU)
+    set(options STATIC static SHARED shared)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(_sources ${hip_library_SRCS})
+    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    if(_source_files)
+      list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    if(hip_library_SRCS)
+      if (hip_library_SHARED OR hip_library_shared) # build *.so
+        add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
+        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+      else()
+        add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
+        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+        target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
+        find_fluid_modules(${TARGET_NAME})
+      endif()
+      if (hip_library_DEPS)
+        add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
+        target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+      endif()
+      # cpplint code style
+      foreach(source_file ${hip_library_SRCS})
+        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        endif()
+      endforeach()
+      add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
+    else(hip_library_SRCS)
+      if (hip_library_DEPS)
+        merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
+      else()
+        message(FATAL_ERROR "Please specify source file or library in hip_library.") # FATAL_ERROR stops configuration; bare FATAL is not a message mode.
+      endif()
+    endif(hip_library_SRCS)
+  endif()
+endfunction(hip_library)
+
+function(hip_binary TARGET_NAME) # Build a HIP executable from SRCS and link/order it after DEPS; no-op unless WITH_AMD_GPU.
+  if (WITH_AMD_GPU)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
+    if(hip_binary_DEPS)
+      target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
+      add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+    endif()
+  endif()
+endfunction(hip_binary)
+
+function(hip_test TARGET_NAME) # Build a HIP gtest executable from SRCS and register it with add_test; needs WITH_AMD_GPU and WITH_TESTING.
+  if (WITH_AMD_GPU AND WITH_TESTING)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(_sources ${hip_test_SRCS})
+    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    if(_source_files)
+      list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
+    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) # NOTE(review): cc_test/nv_test also link glog; confirm the omission here is intentional.
+    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    add_test(${TARGET_NAME} ${TARGET_NAME})
+  endif()
+endfunction(hip_test)
+
 function(go_library TARGET_NAME)
   set(options STATIC static SHARED shared)
   set(oneValueArgs "")
@@ -485,9 +561,9 @@ function(py_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
+             COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
-             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
 endfunction()
 
@@ -511,6 +587,9 @@ function(grpc_library TARGET_NAME)
   get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
   get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
 
+  #FIXME(putcn): the following line is supposed to generate *.pb.h and *.pb.cc,
+  # but somehow it doesn't. Lines 602 to 604 (the extra protoc --cpp_out
+  # command) patch this. Leaving it here for now to enable dist CI.
   protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
   set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
   set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
@@ -521,6 +600,9 @@ function(grpc_library TARGET_NAME)
           COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
           ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
           --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+          ARGS --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+          "${ABS_PROTO}"
           DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
 
   # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..bfe491bd6b7602959d3dd60bd06c67993593cc9b
--- /dev/null
+++ b/cmake/hip.cmake
@@ -0,0 +1,43 @@
+if(NOT WITH_AMD_GPU)  # nothing below applies unless the AMD GPU (HIP) build is enabled
+    return()
+endif()
+
+include_directories("/opt/rocm/include")  # ROCm headers; the /opt/rocm prefix is hard-coded
+include_directories("/opt/rocm/hipblas/include")
+include_directories("/opt/rocm/hiprand/include")
+include_directories("/opt/rocm/rocrand/include")
+include_directories("/opt/rocm/rccl/include")
+include_directories("/opt/rocm/thrust")
+
+list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")  # link against hip_hcc from /opt/rocm/lib
+
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+
+if(WITH_DSO)  # this and the next two blocks mirror build options as preprocessor defines
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
+endif(WITH_DSO)
+
+if(WITH_DOUBLE)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
+endif(WITH_DOUBLE)
+
+if(WITH_TESTING)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
+endif(WITH_TESTING)
+
+if(CMAKE_BUILD_TYPE  STREQUAL "Debug")  # Release (or an unset build type) appends nothing here
+    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+endif()
+
+if("x${HCC_HOME}" STREQUAL "x")  # default HCC_HOME when it is empty or undefined
+  set(HCC_HOME "/opt/rocm/hcc")
+endif()
+# Link rules for the HIP language. CMake substitutes the <...> placeholders when it generates the build.
+set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
+set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
+
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 6b2237b858380f384be0aa3c6ae24a4c83ad646d..0323cd9698cba916d2aa04403be97c0a6a463830 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -69,6 +69,12 @@ if(NOT CBLAS_FOUND)
       SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
       DSTS ${dst_dir} ${dst_dir}
     )
+elseif (WITH_MKLML)
+    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
+    copy(mklml_lib
+      SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
+      DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
+    )
 endif()
 
 # paddle fluid module
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index da67701ec1af57df742dce105990cffa40f45d7c..7066637a7cb27b83724cb4030c29a1019981f52b 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -1 +1,9 @@
+add_custom_target(paddle_apis ALL  # umbrella target (part of "all") naming both API doc targets
+                  DEPENDS paddle_v2_apis paddle_fluid_apis)
+
+add_custom_target(paddle_docs ALL  # umbrella target for the v2 and fluid docs, English and Chinese
+                  DEPENDS paddle_v2_docs paddle_v2_docs_cn
+                  paddle_fluid_docs paddle_fluid_docs_cn)  # NOTE(review): CMake documents DEPENDS for files/custom-command outputs and add_dependencies() for targets - confirm these target names are honored
+
 add_subdirectory(v2)
+add_subdirectory(fluid)
diff --git a/doc/design/file_manager/README.md b/doc/design/file_manager/README.md
deleted file mode 100644
index 3df10d801e568834729f902aace483d033340e2d..0000000000000000000000000000000000000000
--- a/doc/design/file_manager/README.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# FileManager设计文档
-## 目标
-在本文档中,我们设计说明了名为FileManager系统,方便用户上传自己的训练数据以进行分布式训练
-
-主要功能包括:
-
-- 提供常用的命令行管理命令管理文件和目录
-- 支持大文件的断点上传、下载  
-
-## 名词解释
-- PFS:是`Paddlepaddle cloud File System`的缩写,是对用户文件存储空间的抽象,与之相对的是local filesystem。目前我们用CephFS来搭建。
-- [CephFS](http://docs.ceph.com/docs/master/cephfs/):一个POSIX兼容的文件系统。
-- Chunk:逻辑划上文件分块的单位。
-
-## 模块
-### 架构图
-
-
-### PFSClient
-- 功能: 详细设计[link](./pfs/pfsclient.md)
-	- 提供用户管理文件的命令
-	- 需要可以跨平台执行
-
-- 双向验证   
-	PFSClient需要和Ingress之间做双向验证[tls](#tls),所以用户需要首先在`cloud.paddlepaddle.org`上注册一下,申请用户空间,并且把系统生成的CA(certificate authority)、Key、CRT(CA signed certificate)下载到本地,然后才能使用PFSClient。
-		
-### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/)
-- 功能:  
-	提供七层协议的反向代理、基于粘性会话的负载均衡功能。
-	
-- 透传用户身份的办法  
-	Ingress需要把PFSClient的身份信息传给PFSServer,配置的方法参考[link](http://www.integralist.co.uk/posts/clientcertauth.html#3)
-
-### PFSServer
-PFSServer提供RESTful API接口,接收处理PFSClient端的文件管理请求,并且把结果返回PFSClient端。
-
-RESTful API
-
-- /api/v1/files
-	- `GET /api/v1/files`: Get metadata of files or directories.
-	- `POST /api/v1/files`: Create files or directories.
-	- `PATCH /api/v1/files`: Update files or directories.
-	- `DELETE /api/v1/files`: Delete files or directories.
-
-- /api/v1/file/chunks
-	- `GET /api/v1/storage/file/chunks`: Get chunks's metadata of a file.
-
-- /api/v1/storage/files
-	- `GET /api/v1/storage/files`: Download files or directories.
-	- `POST /api/v1/storage/files`: Upload files or directories.
-
-- /api/v1/storage/file/chunks
-	- `GET /api/v1/storage/file/chunks`: Download chunks's data.
-	- `POST /api/v1/storage/file/chunks`: Upload chunks's data.
-
-## 文件传输优化
-
-### 分块文件传输
-用户文件可能是比较大的,上传到Cloud或者下载到本地的时间可能比较长,而且在传输的过程中也可能出现网络不稳定的情况。为了应对以上的问题,我们提出了Chunk的概念,一个Chunk由所在的文件偏移、数据、数据长度及校验值组成。文件的上传和下载都是通过对Chunk的操作来实现的。由于Chunk比较小(默认256K),完成一个传输动作完成的时间也比较短,不容易出错。PFSClient需要在传输完毕最后一个Chunk的时候检查destination文件的MD5值是否和source文件一致。
-
-一个典型的Chunk如下所示:
-
-```
-type Chunk struct {
-	fileOffset int64
-	checksum uint32
-	len     uint32
-	data    []byte
-}
-```  
-
-### 生成sparse文件
-当destination文件不存在或者大小和source文件不一致时,可以用[Fallocate](https://Go.org/pkg/syscall/#Fallocate)生成sparse文件,然后就可以并发写入多个Chunk。
-
-### 覆盖不一致的部分
-文件传输的的关键在于需要PFSClient端对比source和destination的文件Chunks的checksum是否保持一致,不一致的由PFSClient下载或者传输Chunk完成。这样已经传输成功的部分就不用重新传输了。
-
-## 用户使用流程
-参考[link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md)
-
-## 框架生成
-用[swagger](https://github.com/swagger-api/swagger-codegen)生成PFSClient和PFSServer的框架部分,以便我们可以把更多的精力放到逻辑本身上。
-
-## 参考文档
-- [TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md)
-- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/)
-- [linux man document](https://linux.die.net/man/)
diff --git a/doc/design/file_manager/pfs/pfsclient.md b/doc/design/file_manager/pfs/pfsclient.md
deleted file mode 100644
index 56bc70c54bbc92b78d66e04fb495b1300cf8ebe0..0000000000000000000000000000000000000000
--- a/doc/design/file_manager/pfs/pfsclient.md
+++ /dev/null
@@ -1,129 +0,0 @@
-# PFSClient
-
-## Description
-The `pfs` command is a Command Line Interface to manage your files on PaddlePaddle Cloud
-
-## Synopsis
-```
-paddle [options] pfs  [parameters]
-```
-
-## Options
-```
---profile (string)
-	Use a specific profile from your credential file.
-
---help (string)
-	Display more information about command
-
---version
-	Output version information and exit
-
---debug
-	Show detailed debugging log	
-	
---only-show-errors (boolean) 
-	Only errors and warnings are displayed. All other output is suppressed.
-```
-
-## Path Arguments
-When using a command, we need to specify path arguments. There are two path argument type: `localpath` and `pfspath`.  
-
-A `pfspath` begin with `/pfs`, eg: `/pfs/$DATACENTER/home/$USER/folder`.
-
-[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to config datacenters.
-
-## order of Path Arguments
-Commonly, if there are two path arguments, the first is the source, and the second is the destination.
-
-## Subcommonds
-- rm - remove files or directories
-
-```
-Synopsis:
-	rm [-r] [-v]  ...
-
-Options:
-	-r 
-		Remove directories and their contents recursively 
-	-v      
-		Cause rm to be verbose, showing files after they are removed.
-	
-Examples:
-	paddle pfs rm /pfs/$DATACENTER/home/$USER/file
-	paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder
-```
-- mv - move (rename) files
-
-```
-Synopsis:
-	mv [-f | -n] [-v]  
-	mv [-f | -n] [-v]  ... 
-	mv [-f | -n] [-v]   
-	mv [-f | -n] [-v]  ...  
-	mv [-f | -n] [-v]   
-	mv [-f | -n] [-v]  ...  
-	
-Options:
-	-f      
-		Do not prompt for confirmation before overwriting the destination path.  (The -f option overrides previous -n options.)
-	-n      
-		Do not overwrite an existing file.  (The -n option overrides previous -f options.)
-	-v      
-		Cause mv to be verbose, showing files after they are moved.
-		
-Examples:
-	paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt
-```
-- cp - copy files or directories
-
-```
-Synopsis:
-	cp [-r] [-f | -n] [-v] [--preserve--links]  
-	cp [-r] [-f | -n] [-v] [--preserve--links]  ... 
-	cp [-r] [-f | -n] [-v] [--preserve--links]   
-	cp [-r] [-f | -n] [-v] [--preserve--links]  ... 
-	cp [-r] [-f | -n] [-v] [--preserve--links]   
-	cp [-r] [-f | -n] [-v] [--preserve--links]  ... 
-
-Options:
-	-r
-   		Copy directories recursively
-   	-f      
-		Do not prompt for confirmation before overwriting the destination path.  (The -f option overrides previous -n options.)
-	-n      
-		Do not overwrite an existing file.  (The -n option overrides previous -f options.)
-	-v      
-		Cause cp to be verbose, showing files after they are copied.
-	--preserve--links
-	   Reserve links when copy links
-	   
-Examples:
-	paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file
-	paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file
-```
-- ls- list files
-
-```
-Synopsis:
-	ls [-r]  ...
-	
-Options:
-	-R
-   		List directory(ies) recursively
-
-Examples:
-	paddle pfs ls  /pfs/$DATACENTER/home/$USER/file
-	paddle pfs ls  /pfs/$DATACENTER/home/$USER/folder
-```
-
-- mkdir - mkdir directory(ies)
-Create intermediate directory(ies) as required.
-
-```
-Synopsis:
-	mkdir  ...
-
-Examples:
-	paddle pfs mkdir  /pfs/$DATACENTER/home/$USER/folder
-```
diff --git a/doc/design/file_manager/src/filemanager.graffle b/doc/design/file_manager/src/filemanager.graffle
deleted file mode 100644
index 7861a33072bc1908f69d12b37c20491dd8663103..0000000000000000000000000000000000000000
Binary files a/doc/design/file_manager/src/filemanager.graffle and /dev/null differ
diff --git a/doc/design/file_manager/src/filemanager.png b/doc/design/file_manager/src/filemanager.png
deleted file mode 100644
index 8139a19f5722f56d3c211f3ab0d3982f751134b9..0000000000000000000000000000000000000000
Binary files a/doc/design/file_manager/src/filemanager.png and /dev/null differ
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8086507bb4b7e870ad6d6091945ed07a00b5100b
--- /dev/null
+++ b/doc/fluid/CMakeLists.txt
@@ -0,0 +1,55 @@
+if(NOT DEFINED SPHINX_THEME)  # default the theme variable when the caller did not define one
+    set(SPHINX_THEME default)
+endif()
+
+if(NOT DEFINED SPHINX_THEME_DIR)
+    set(SPHINX_THEME_DIR)  # NOTE(review): set() with no value leaves the variable unset per the CMake docs - confirm the conf.py template tolerates that
+endif()
+
+# English docs: directory that receives the configured conf.py and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# English docs: Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# English docs: HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)  # substitute only @VAR@ references; ${VAR} text in the template is left untouched
+
+sphinx_add_target(paddle_fluid_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)  # build the generated protos and the Python package before the docs
+
+# Chinese docs: directory that receives the configured conf.py and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Chinese docs: Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# Chinese docs: HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+    "${BINARY_BUILD_DIR_CN}/conf.py"
+    @ONLY)  # substitute only @VAR@ references; ${VAR} text in the template is left untouched
+
+sphinx_add_target(paddle_fluid_docs_cn
+                  html
+                  ${BINARY_BUILD_DIR_CN}
+                  ${SPHINX_CACHE_DIR_CN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_CN})
+
+add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)  # same prerequisites as the English target
+
+add_subdirectory(api)  # descends into doc/fluid/api
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..48b396f0786adad1ba6cd41f72497f853e54bc38
--- /dev/null
+++ b/doc/fluid/api/CMakeLists.txt
@@ -0,0 +1,22 @@
+# Directory that receives the configured conf.py and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)  # substitute only @VAR@ references; ${VAR} text in the template is left untouched
+
+sphinx_add_target(paddle_fluid_apis
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_fluid_apis  gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)  # these targets are built before the API docs
diff --git a/doc/v2/api/fluid/data_feeder.rst b/doc/fluid/api/data_feeder.rst
similarity index 100%
rename from doc/v2/api/fluid/data_feeder.rst
rename to doc/fluid/api/data_feeder.rst
diff --git a/doc/v2/api/fluid/evaluator.rst b/doc/fluid/api/evaluator.rst
similarity index 100%
rename from doc/v2/api/fluid/evaluator.rst
rename to doc/fluid/api/evaluator.rst
diff --git a/doc/v2/api/fluid/executor.rst b/doc/fluid/api/executor.rst
similarity index 100%
rename from doc/v2/api/fluid/executor.rst
rename to doc/fluid/api/executor.rst
diff --git a/doc/v2/api/fluid/gen_doc.py b/doc/fluid/api/gen_doc.py
similarity index 100%
rename from doc/v2/api/fluid/gen_doc.py
rename to doc/fluid/api/gen_doc.py
diff --git a/doc/v2/api/fluid/gen_doc.sh b/doc/fluid/api/gen_doc.sh
similarity index 100%
rename from doc/v2/api/fluid/gen_doc.sh
rename to doc/fluid/api/gen_doc.sh
diff --git a/doc/v2/api/fluid/index.rst b/doc/fluid/api/index_en.rst
similarity index 100%
rename from doc/v2/api/fluid/index.rst
rename to doc/fluid/api/index_en.rst
diff --git a/doc/v2/api/fluid/initializer.rst b/doc/fluid/api/initializer.rst
similarity index 100%
rename from doc/v2/api/fluid/initializer.rst
rename to doc/fluid/api/initializer.rst
diff --git a/doc/v2/api/fluid/io.rst b/doc/fluid/api/io.rst
similarity index 100%
rename from doc/v2/api/fluid/io.rst
rename to doc/fluid/api/io.rst
diff --git a/doc/v2/api/fluid/layers.rst b/doc/fluid/api/layers.rst
similarity index 99%
rename from doc/v2/api/fluid/layers.rst
rename to doc/fluid/api/layers.rst
index ae35d8c53476b34cb18331364267dd7c8b94dd64..22e6fb13d7320986a60bc1ef5530187e0970c767 100644
--- a/doc/v2/api/fluid/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -494,6 +494,12 @@ reshape
 ..  autofunction:: paddle.fluid.layers.reshape
     :noindex:
 
+pad
+---
+
+..  autofunction:: paddle.fluid.layers.pad
+    :noindex:
+
 scale
 -----
 
diff --git a/doc/v2/api/fluid/nets.rst b/doc/fluid/api/nets.rst
similarity index 100%
rename from doc/v2/api/fluid/nets.rst
rename to doc/fluid/api/nets.rst
diff --git a/doc/v2/api/fluid/optimizer.rst b/doc/fluid/api/optimizer.rst
similarity index 100%
rename from doc/v2/api/fluid/optimizer.rst
rename to doc/fluid/api/optimizer.rst
diff --git a/doc/v2/api/fluid/param_attr.rst b/doc/fluid/api/param_attr.rst
similarity index 100%
rename from doc/v2/api/fluid/param_attr.rst
rename to doc/fluid/api/param_attr.rst
diff --git a/doc/v2/api/fluid/profiler.rst b/doc/fluid/api/profiler.rst
similarity index 100%
rename from doc/v2/api/fluid/profiler.rst
rename to doc/fluid/api/profiler.rst
diff --git a/doc/v2/api/fluid/regularizer.rst b/doc/fluid/api/regularizer.rst
similarity index 100%
rename from doc/v2/api/fluid/regularizer.rst
rename to doc/fluid/api/regularizer.rst
diff --git a/doc/fluid/build_and_install/build_from_source_cn.rst b/doc/fluid/build_and_install/build_from_source_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..ae4e8c7c48e584ec16a7be5466f83dd154ffb5fb
--- /dev/null
+++ b/doc/fluid/build_and_install/build_from_source_cn.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/build_from_source_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/build_from_source_en.rst b/doc/fluid/build_and_install/build_from_source_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..1ac828c973826bb8374c4aa8e17fda3ea1bb939f
--- /dev/null
+++ b/doc/fluid/build_and_install/build_from_source_en.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/build_from_source_en.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/docker_install_cn.rst b/doc/fluid/build_and_install/docker_install_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..965b2e20559291989422938c418fadbac16941b9
--- /dev/null
+++ b/doc/fluid/build_and_install/docker_install_cn.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/docker_install_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/docker_install_en.rst b/doc/fluid/build_and_install/docker_install_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..79d7341a7bbb9e477c773134f24983fd7607769a
--- /dev/null
+++ b/doc/fluid/build_and_install/docker_install_en.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/docker_install_en.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..f697fcd8fac9131862ae7f8f51c5ebe93737ad2d
--- /dev/null
+++ b/doc/fluid/build_and_install/index_cn.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/index_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..502f66a41319d4f41ae1774628ca36da9dca76ce
--- /dev/null
+++ b/doc/fluid/build_and_install/index_en.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/index_en.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/pip_install_cn.rst b/doc/fluid/build_and_install/pip_install_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..07deca84b82ff553e0c19324695089dcfb6be90e
--- /dev/null
+++ b/doc/fluid/build_and_install/pip_install_cn.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/pip_install_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/pip_install_en.rst b/doc/fluid/build_and_install/pip_install_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..7f39c998195b719b05443e96f1c4a6a8d44b98c9
--- /dev/null
+++ b/doc/fluid/build_and_install/pip_install_en.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/pip_install_en.rst
\ No newline at end of file
diff --git a/doc/fluid/design/algorithm/index_cn.rst b/doc/fluid/design/algorithm/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0883a9dc9c457f393ac1bdc930cb47ebcb0a25d9
--- /dev/null
+++ b/doc/fluid/design/algorithm/index_cn.rst
@@ -0,0 +1,7 @@
+梯度更新算法
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  parameter_average.md
diff --git a/doc/fluid/design/algorithm/index_en.rst b/doc/fluid/design/algorithm/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..59fe68dcf79ce2ef90b9adc829a0db45a4f0b3dc
--- /dev/null
+++ b/doc/fluid/design/algorithm/index_en.rst
@@ -0,0 +1,7 @@
+Gradient Update Algorithm
+--------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  parameter_average.md
diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
index 2c4edee9fe31d502ea62b9fe5c8757c0a4c5e79f..940d37fb31dcd0c50ea6c4c42b052d7cb23a9c47 100644
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -5,9 +5,11 @@ In a large scale machine learning setup where the size of the training data is h
 
 Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
 
-Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for 
 . The averaging is done as follows:
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for 
 . The averaging is done as follows:
 
-
+
+
+
 
 We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
 
diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md
index bf0e4dddc1b640ecbce489f65820aaf8a4b3b1e7..8ded0ad22f4013a521bf3bee260565dc5cf855ae 100644
--- a/doc/fluid/design/concepts/README.md
+++ b/doc/fluid/design/concepts/README.md
@@ -2,15 +2,37 @@ A few months ago when we were trying to replace CMake with Bazel, @emailweixu su
 
 Here are some initial thoughts. Your comments are welcome!
 
-### Required CMake Function
+# Required CMake Function
 
 I think we need only the following few CMake functions to make a project description mean and clean:
 
-| C++ | CUDA C++ | Go |
-|---|---|---|
-| cc_library | nv_library | go_library |
-| cc_binary | nv_binary | go_binary |
-| cc_test | nv_test | go_test |
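+<table>
+<thead>
+<tr>
+<th>C++</th>
+<th>CUDA C++</th>
+<th>Go</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>cc_library</td>
+<td>nv_library</td>
+<td>go_library</td>
+</tr>
+<tr>
+<td>cc_binary</td>
+<td>nv_binary</td>
+<td>go_binary</td>
+</tr>
+<tr>
+<td>cc_test</td>
+<td>nv_test</td>
+<td>go_test</td>
+</tr>
+</tbody>
+</table>
+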
 
 - The `_library` functions generate  .a files from source code.
 - The `_binary` functions generate executable binary files.
@@ -25,7 +47,7 @@ Also,
 - to describe external dependencies, we need `external_library`.
 - to build shared libraries, we need `shared_library`.
 
-### An Example Project
+## An Example Project
 
 Suppose that we have aforementioned functions defined in our `/cmake` directory.  The following example `CMakeLists.txt` describes a project including the following source files:
 
@@ -102,11 +124,11 @@ shared_library(api
 
 ```
 
-### Implementation
+## Implementation
 
 As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph.  It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
 
-### Using Package Manager For Go
+## Using Package Manager For Go
 
 Building Go binaries and libraries need to satisfy their dependencies, generally
 we can do `go get ./...` to download and compile all external dependencies. The
@@ -122,7 +144,7 @@ problems are:
    at many cloud file hosting, so users what to compile paddle by themselves can
    download this "vendor" package from a mirror site.
 
-#### Choose A Suitable Tool
+### Choose A Suitable Tool
 
 As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools)
 list dozens of Go package managers. We choose the tool using following principles:
@@ -140,7 +162,7 @@ management tool has been started at: https://github.com/golang/dep to resolve
 such problems, but it's currently at Alpha stage. So the best choice now is
 glide obviously.
 
-#### Manage Go Packages
+### Manage Go Packages
 
 - Dependencies: `go/glide.yaml` will store the dependencies and their versions which
   is directly imported by paddle. `go/glide.lock` will store all dependencies recursively
diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md
index 907a2def557fd472ac4d679c73447bd9107d1190..3b626bd89cd83a9428997abccfeeebbbbdbb3d38 100644
--- a/doc/fluid/design/concepts/block.md
+++ b/doc/fluid/design/concepts/block.md
@@ -14,11 +14,29 @@ In programming languages, a block is a pair of curly braces that includes local
 
 Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
 
-| programming languages | PaddlePaddle          |
-|-----------------------|-----------------------|
-| for, while loop       | RNN, WhileOp          |
-| if, if-else, switch   | IfElseOp, SwitchOp    |
-| sequential execution  | a sequence of layers  |
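+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>for, while loop</td>
+<td>RNN, WhileOp</td>
+</tr>
+<tr>
+<td>if, if-else, switch</td>
+<td>IfElseOp, SwitchOp</td>
+</tr>
+<tr>
+<td>sequential execution</td>
+<td>a sequence of layers</td>
+</tr>
+</tbody>
+</table>
+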
 
 A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
 
@@ -26,12 +44,33 @@ A key difference is that a C++ program describes a one pass computation, whereas
 
 The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
 
-| programming languages | PaddlePaddle                    |
-|-----------------------|---------------------------------|
-| stack                 | scope hierarchy                 |
-| stack frame           | scope                           |
-| push at entering block| push at entering block          |
-| pop at leaving block  | destroy when minibatch completes|
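+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>stack</td>
+<td>scope hierarchy</td>
+</tr>
+<tr>
+<td>stack frame</td>
+<td>scope</td>
+</tr>
+<tr>
+<td>push at entering block</td>
+<td>push at entering block</td>
+</tr>
+<tr>
+<td>pop at leaving block</td>
+<td>destroy when minibatch completes</td>
+</tr>
+</tbody>
+</table>
+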
 
 1. In traditional programs:
 
diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md
index 8607b40ccbbe01db77afed72c1efa780b520744c..aabc1ba75a67c5767d409bd6e7e6240dec86b16c 100644
--- a/doc/fluid/design/concepts/cpp_data_feeding.md
+++ b/doc/fluid/design/concepts/cpp_data_feeding.md
@@ -113,7 +113,7 @@ To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an e
 
 To create and invoke readers, some new ops are introduced:
 
-### CreateReaderOp
+### Operators That Create Readers
 
 Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
 
@@ -153,19 +153,52 @@ double_buffer_reader = create_double_buffer_op(batch_reader)
 The forwarding ops of the corresponding `main_program` would be like this:
 
 ```
-while_op {
+not_completed = true
+pass_count = 0
+while_op(not_completed) {
     has_next = has_next_op(double_buffer_reader)
     if_else_op(has_next) {
         batch_data = read_op(double_buffer_reader)
         ... (subsequent training ops)
     } else {
         reset_op(double_buffer_reader)
+        increase_op(pass_count)
+        not_completed = less_than_op(pass_count, required_pass_num)
     }
 }
 ```
 
-Two important considerations for these programs are as follows:
+A few important considerations for these programs are as follows:
 
-1. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
+1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables.
 
-2. All readers exist in both `startup_program` and `main_program`. And they are persistable.
+2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
+
+3. All readers exist in both `startup_program` and `main_program`. And they are persistable.
+
+### Simplify Configuration by MultiPassReader
+
+The Program configuration mentioned above is complicated. Users need to be very familiar with the concepts of Program and Block to avoid making mistakes in their code. To make C++ readers friendlier to new users, we introduce `MultiPassReader`.
+
+`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes (`pass_num`) and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches EOF, the multi-pass reader checks whether it has completed the given number of passes. If not, the underlying reader is re-initialized and a new pass starts automatically. Until the whole training is complete, `MultiPassReader`'s `HasNext()` always returns `true`.
+
+With `MultiPassReader`, the startup program would be like this:
+
+```
+multiple_reader = open_files_op(...)
+batch_reader = create_batch_reader_op(multiple_reader)
+multi_pass_reader = create_multi_pass_reader_op(batch_reader)
+double_buffer_reader = create_double_buffer_op(multi_pass_reader)
+... (other initializers)
+```
+
+The forwarding part of the corresponding `main_program` would be like this:
+
+```
+not_completed = true
+while_op(not_completed) {
+    batch_data = read_op(double_buffer_reader)
+    ... (subsequent training ops)
+    not_completed = has_next_op(double_buffer_reader)
+}
+```
diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
index 984b59f4c6971dfb6f46dfe342f2751f392c0e88..30bc488a18a28d349645d9d2502aae6691a69931 100644
--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -86,12 +86,40 @@ def layer.fc(X):
 
 We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`.  So we have the following concepts in above illustrative example:
 
-
-| C++ functions/functors | mul          | add          |             |          |
-|------------------------|--------------|--------------|-------------|----------|
-| C++ operator class     | mulOp        | addOp        | FCOp        |          |
-| Python binding         | operator.mul | operator.add | operator.fc |          |
-| Python function        |              |              |             | layer.fc |
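+<table>
+<thead>
+<tr>
+<th>C++ functions/functors</th>
+<th>mul</th>
+<th>add</th>
+<th></th>
+<th></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>C++ operator class</td>
+<td>mulOp</td>
+<td>addOp</td>
+<td>FCOp</td>
+<td></td>
+</tr>
+<tr>
+<td>Python binding</td>
+<td>operator.mul</td>
+<td>operator.add</td>
+<td>operator.fc</td>
+<td></td>
+</tr>
+<tr>
+<td>Python function</td>
+<td></td>
+<td></td>
+<td></td>
+<td>layer.fc</td>
+</tr>
+</tbody>
+</table>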
 
 
 This is how we differentiate layer and operators in PaddlePaddle:
diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.dot b/doc/fluid/design/concepts/images/parallel_executor_overview.dot
new file mode 100644
index 0000000000000000000000000000000000000000..40753cb140540c08d9d4c449b8d377e315280436
--- /dev/null
+++ b/doc/fluid/design/concepts/images/parallel_executor_overview.dot
@@ -0,0 +1,83 @@
+digraph G {
+  subgraph cluster_init {
+    label="Initialization"
+    startup_program [label="startup", shape=box]
+    node_w_g0 [label="W\nGPU0"]
+    startup_program -> node_w_g0 [label="Initialize"]
+    node_w_g1 [label="W\nGPU1"]
+    node_w_g0 -> node_w_g1 [label="broadcast"]
+  }
+
+  subgraph cluster_train {
+    label="forward_backward"
+
+    subgraph cluster_gpu0 {
+      label="GPU0"
+      fc_0 [label="fc\nGPU0", shape=box]
+      hidden_0 [label="hidden\nGPU0"]
+      node_w_g0 -> fc_0
+      fc_0 -> hidden_0
+      loss0 [label="loss\nGPU0"]
+      hidden_0 -> loss0 [label="many ops omitted"]
+      scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
+      loss_g0 [label="loss_grad\nGPU0"]
+      scale_loss_0->loss_g0
+      
+      fc_g_0 [label="w_grad\nGPU0", shape=box]
+      loss0 -> fc_g_0
+      loss_g0 -> fc_g_0
+      hidden_0 -> fc_g_0
+    }
+
+    subgraph cluster_gpu1 {
+      label="GPU1"
+      fc_1 [label="fc\nGPU1", shape=box]
+      hidden_1 [label="hidden\nGPU1"]
+      node_w_g1 -> fc_1
+      fc_1 -> hidden_1
+      loss1 [label="loss\nGPU1"]
+      hidden_1 -> loss1 [label="many ops omitted"]
+      scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
+      loss_g1 [label="loss_grad\nGPU1"]
+      scale_loss_1->loss_g1
+      
+      fc_g_1 [label="w_grad\nGPU1", shape=box]
+      loss1 -> fc_g_1
+      loss_g1 -> fc_g_1
+      hidden_1 -> fc_g_1
+    }
+  }
+
+  all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
+  fc_g_0 -> all_reduce_w
+  fc_g_1 -> all_reduce_w
+
+  fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
+  fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
+  all_reduce_w -> fc_g_0_merged
+  all_reduce_w -> fc_g_1_merged
+
+  subgraph cluster_optimization {
+    label="Optimization"
+    subgraph cluster_opt_gpu0 {
+      label="GPU0"
+      sgd_0 [label="SGD Op\nGPU0", shape=box]
+
+      fc_g_0_merged -> sgd_0
+      node_w_g0 -> sgd_0
+      optimized_w_0 [label="Optimized W\nGPU0"]
+      sgd_0 -> optimized_w_0
+    }
+    subgraph cluster_opt_gpu1 {
+      label="GPU1"
+      sgd_1 [label="SGD Op\nGPU1", shape=box]
+
+      fc_g_1_merged -> sgd_1
+      node_w_g1 -> sgd_1
+      optimized_w_1 [label="Optimized W\nGPU1"]
+      sgd_1 -> optimized_w_1
+    }
+  }
+
+
+}
diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.png b/doc/fluid/design/concepts/images/parallel_executor_overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..d890c0ffee3b38dc7cb74a2b56c2ab4831532211
Binary files /dev/null and b/doc/fluid/design/concepts/images/parallel_executor_overview.png differ
diff --git a/doc/fluid/design/concepts/index_cn.rst b/doc/fluid/design/concepts/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dcdc894937ff328e6002623275ca3c65e87b2bb0
--- /dev/null
+++ b/doc/fluid/design/concepts/index_cn.rst
@@ -0,0 +1,19 @@
+核心概念
+-------------
+
+.. toctree::
+  :maxdepth: 1
+
+  README.md
+  cpp_data_feeding.md
+  functions_operators_layers.md
+  program.md
+  variable.md
+  var_desc.md
+  tensor.md
+  tensor_array.md
+  lod_tensor.md
+  block.md
+  scope.md
+  executor.md
+  parallel_executor.md
diff --git a/doc/fluid/design/concepts/index_en.rst b/doc/fluid/design/concepts/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b85a3055746facaa642e8fc899976b58435f1ef2
--- /dev/null
+++ b/doc/fluid/design/concepts/index_en.rst
@@ -0,0 +1,19 @@
+Core Concepts
+--------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  README.md
+  cpp_data_feeding.md
+  functions_operators_layers.md
+  program.md
+  variable.md
+  var_desc.md
+  tensor.md
+  tensor_array.md
+  lod_tensor.md
+  block.md
+  scope.md
+  executor.md
+  parallel_executor.md
diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
index 10a8a7867fbf072f585fe3bfb1243e4e6bef4ec8..a88292e7888d0ebc64ee89ca315dfea38a12c71d 100644
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ b/doc/fluid/design/concepts/lod_tensor.md
@@ -2,12 +2,38 @@
 
 Like other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus no need for padding zeros.
 
-|                       | TensorFlow | PaddlePaddle |
-|-----------------------|------------|--------------|
-| RNN                   | Support    | Support      |
-| recursive RNN         | Support    | Support      |
-| padding zeros         | Must       | No need      |
-| blob data type        | Tensor     | LoDTensor    |
+
+
+
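+<table>
+<thead>
+<tr><th></th><th>TensorFlow</th><th>PaddlePaddle</th></tr>
+</thead>
+<tbody>
+<tr><td>RNN</td><td>Support</td><td>Support</td></tr>
+<tr><td>recursive RNN</td><td>Support</td><td>Support</td></tr>
+<tr><td>padding zeros</td><td>Must</td><td>No need</td></tr>
+<tr><td>blob data type</td><td>Tensor</td><td>LoDTensor</td></tr>
+</tbody>
+</table>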
+
+
+
 
 PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators.  The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences.  This document presents the design of LoD and LoDTensor.
 
diff --git a/doc/fluid/design/concepts/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md
new file mode 100644
index 0000000000000000000000000000000000000000..9aed3b059a1595ba3971d7d5acfc0d16a731584b
--- /dev/null
+++ b/doc/fluid/design/concepts/parallel_executor.md
@@ -0,0 +1,104 @@
+# ParallelExecutor
+
+## Background
+
+Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter(i.e. the `executor` concept in Fluid). The instructions or operators in a `Program` will be executed, and the results will be fetched in Python side.
+
+The executor is a very naive interpreter. It runs operators one by one. We can use `Parallel.Do` to support data parallelism; however, because `ProgramDesc` lacks device information, it is not possible to optimize the performance of `Parallel.Do`.
+
+We want a `ProgramDesc` that can be run on different nodes, so it is better not to contain device information in `ProgramDesc`. However, we can write a high-performance interpreter, which can hold an alternative intermediate representation of `ProgramDesc`, to take full advantage of multiple GPUs.
+
+ParallelExecutor is an interpreter of `ProgramDesc` which will [out-of-order execute](https://en.wikipedia.org/wiki/Out-of-order_execution) `Program` in data parallelism mode and maximise the utility of Multi-GPUs.
+
+
+## Overview of MultiGPUs logic
+
+The ParallelExecutor takes the startup program and main program as inputs. The parameters will be initialised on `GPU0` by the startup program and will be broadcast to all GPUs. The main program will be duplicated onto each GPU. The gradients will be merged during each iteration, and each device will optimize parameters independently. Since the gradients on each device are merged before parameter optimization, the parameters will be the same on each device, so there is no need to broadcast the parameters again.
+
+![ParallelExecutor overview](images/parallel_executor_overview.png)
+
+There are several optimizations for this logic.
+
+1. We use an alternate representation in ParallelExecutor, because the device information is critical for performance optimization.
+2. The execution is out-of-order, i.e., an operator will be executed whenever the inputs of the operator are ready.
+   * The GPU is a high-performance device; a single CPU thread cannot keep one GPU fully occupied. So there is a thread pool to execute operators.
+   * Out-of-order execution also helps transpilers to generate `ProgramDesc`. There is no need to worry about the best-performing order of operators when implementing a transpiler.
+3. The streams of computation, merge gradients and fetch data are different.
+
+The performance of `ResNeXt152` on `TitanX` with `batch_size=12` is shown below.
+
+| Number of GPUs | 1 | 2 | 3 | 4|
+| --- | --- | --- | --- | --- |
+| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 |
+| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 |
+
+
+## Static single assignment Graph
+
+[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form)(`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we use an `SSA` graph as an intermediate representation of `ProgramDesc`.
+
+The `Program` is a directed acyclic graph, since a variable can be assigned multiple times. We enforce that a variable is assigned only once by adding a version number to variables. We parse the `Program` into an `SSA` graph. Also, ParallelExecutor duplicates the `Program` onto multiple devices. We also add a device number to variables and insert `NCCLAllReduce` into the graph.
+
+The data structure of `SSA` graph is:
+
+```c++
+struct VarHandleBase {
+  OpHandleBase* generated_op_;
+  vector<OpHandleBase*> pending_ops_;
+  
+  string name;
+  Place place;
+  size_t version;
+};
+
+struct OpHandleBase {
+  vector<VarHandleBase*> inputs_;
+  vector<VarHandleBase*> outputs_;
+};
+
+struct SSAGraph {
+  // vars on each device.
+  //   * the vars in each map in the vector are on a different device.
+  //   * the map is mapping a variable name to variable handles
+  //   with different versions
+  vector<unordered_map<string, vector<VarHandleBase>>> vars_;
+  
+  // All ops
+  vector<OpHandleBase> ops_;
+};
+```
+The variable handles are the wrapper of `Variables`. The operator handles are the wrapper of `OperatorBase`. Some `OpHandle` is not an `OperatorBase`, such as `NCCLAllReduceOpHandle`, because `AllReduceOpHandle` will use new device contexts.
+
+When the `ProgramDesc` is converted into an `SSA` graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem also needs to be taken care of. Dummy variables, which represent the dependency between operators, will be manually inserted into the SSA graph to resolve the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem.
+
+## Execute SSA Graph
+
+The SSA graph can be out-of-order executed by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is
+
+1. Maintaining a map of an operator and its needed input number.
+2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the needed input number of its pending operators.
+3. If there is an operator whose needed input number has decreased to zero, just run this operator.
+4. After running this operator, mark its output variables as generated and repeat step 2 until all variables are generated.
+
+Running an operator can be asynchronized. There is a thread pool to execute an `SSA` graph.
+
+## Synchronize GPU Kernels
+
+The GPU is a non-blocking device. Different streams need to be synchronized when switching streams. In the current implementation, the synchronization is based on the following algorithm:
+
+1. `OpHandle` will record the `DeviceContext` that it uses.
+2. In `OpHandle::Run`, if the `DeviceContext` of the current operator is different from the `DeviceContext` of any input variable, just wait for the operator that generates this input variable.
+
+The `wait` is implemented by two strategies:
+
+1. Invoke `DeviceContext->Wait()`. It will wait until all operators on this device context complete.
+2. Use `cudaStreamWaitEvent` to send an event to the stream. It is a non-blocking call. The wait operations will be executed on the GPU.
+
+Generally, `cudaStreamWaitEvent` will have better performance. However, the `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed at runtime.
+
+## What's next?
+
+* Merging gradient of dense parameters has been done. However, the merging of sparse parameters has not been done.
+* The CPU version of Parallel Executor has not been implemented. The out-of-order logic will make CPU computation faster, too.
+* A better strategy to merge gradients can be introduced. We can shrink the gradients from `float32` to `int8` or `int4` while merging. It will significantly speed up multi-GPUs training without much loss of precision.
+* Combine with the multi-node implementation. Thanks to out-of-order execution, send and recv operators can be blocking operators, and the transpiler does not need to worry about the best position of these operators.
diff --git a/doc/fluid/design/concepts/scope.md b/doc/fluid/design/concepts/scope.md
index 4da76eebb74abcd26ec2b8671399e6bc4fb58574..dcf76649357aaef80d6bc1a933ece8c4c1063547 100644
--- a/doc/fluid/design/concepts/scope.md
+++ b/doc/fluid/design/concepts/scope.md
@@ -30,7 +30,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
 
    Variable can not belong to many scopes. If you want to use variables from parent scope, you can use `parent scope`.
 
-1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else. 
+1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else.
 
    Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be an invalid pointer when associated `Scope` is destroyed.
 
@@ -78,7 +78,7 @@ In `Scope` class, there is a private data member called `parent_`. `parent_` is
 
 A local scope is very useful when we implement Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily.
 
-# Interface Design
+## Interface Design
 
 ```cpp
 class Variable {
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
index 6a45af1995463402ba9c65ddb51c6c8bb107f99e..6750323c0167bf1efbde6ef4fd670e88a5aa502a 100644
--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -1,3 +1,5 @@
+# Design Doc: Var_desc
+
 ## Background
 PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
 
@@ -8,10 +10,27 @@ PaddlePaddle uses proto message to describe compile time program because :
 
 The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`)  and  `Operations`. The concept to represent them is in the table below.
 
-| |compile time|runtime|
-|---|---|---|
-|Data|VarDesc(proto)|Variable(cpp)|
-|Operation|OpDesc(proto)|Operator(cpp)|
+
+
+
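+<table>
+<thead>
+<tr><th></th><th>compile time</th><th>runtime</th></tr>
+</thead>
+<tbody>
+<tr><td>Data</td><td>VarDesc(proto)</td><td>Variable(cpp)</td></tr>
+<tr><td>Operation</td><td>OpDesc(proto)</td><td>Operator(cpp)</td></tr>
+</tbody>
+</table>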
+
+
 
 
 ## Definition of VarType
diff --git a/doc/fluid/design/concurrent/channel.md b/doc/fluid/design/concurrent/channel.md
new file mode 100644
index 0000000000000000000000000000000000000000..df67438bcc741ac521b00ee962fc13c93db21182
--- /dev/null
+++ b/doc/fluid/design/concurrent/channel.md
@@ -0,0 +1,139 @@
+# Channel Design
+
+## Introduction
+
+A Channel is a data structure that allows for synchronous interprocess
+communication via message passing.  It is a fundamental component of CSP
+(communicating sequential processes), and allows for users to pass data
+between threads without having to worry about synchronization.
+
+## How to use it
+
+Paddle offers python APIs to open and close channels, along with sending
+and receiving data to/from a channel.
+
+### Create a channel
+
+Creates a new channel that takes in variables of a specific dtype.
+
+- **fluid.make_channel(dtype, capacity=0)**
+  - **dtype**: The data type of variables being sent/received through channel
+  - **capacity**: The capacity of the channel.  A capacity of 0 represents
+    an unbuffered channel.  Capacity > 0 represents a buffered channel
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=10)
+```
+
+### Close a channel
+
+Closes a channel.  Any pending senders and receivers will be awoken during
+this time.  Receivers can still receive from a closed channel, but senders
+are not allowed to send any additional data to the channel (Paddle will
+raise an exception if users try to send to a closed channel.)
+
+- **fluid.channel_close(channel)**
+
+```
+fluid.channel_close(ch)
+```
+
+### Send data to a channel
+
+Sends a variable to a channel.  Currently, variables of dtype `LoDTensor`,
+`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
+`ChannelHolder` are supported.
+
+By default, the data of the Variable is moved from the sender to the receiver,
+however the user can optionally copy the data before performing the send.
+
+- **channel_send(channel, variable, is_copy=False)**
+  - **channel**: The channel to send the variable to
+  - **variable**: The variable to send to the channel
+  - **is_copy**: If set to True, channel_send will perform a variable assign
+  to copy the source variable to a new variable to be sent.
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=100)
+fluid.channel_send(ch, var, True)
+```
+
+### Receive data from a channel
+
+Receives a variable from a channel.  The data of the variable is moved to the
+receiving variable.
+
+- **channel_recv(channel, return_variable)**
+  - **channel**: The channel to receive the variable from
+  - **return_variable**: The destination variable used to store the data of the
+  variable received from the channel
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=-1)
+fluid.channel_recv(ch, var)
+```
+
+## How it Works
+
+Channels provide a simple interface for different threads to share data.
+To support the synchronization requirements, channels utilize a series of
+internal queues, locks, and conditional variables.
+
+### QueueMessage
+
+QueueMessage encapsulates the state of the channel send/receive operation to be
+put in the **sendq/recvq**.  It contains a condition variable used to lock the
+thread (when there are no available sends/receives).  In addition, it contains
+a callback function to notify a thread when the QueueMessage is being
+processed by the channel.
+
+### Queues
+
+- **buff_**: This queue holds the data buffer in a buffered channel.  The
+capacity is set to the capacity of the channel.  This data buffer is not
+used in an unbuffered channel.
+
+- **sendq**: This queue holds the QueueMessage of any pending senders of a
+channel.  When a thread performs a channel_send operation on the channel, the
+channel_send operation will put a new QueueMessage on the sendq and block the
+current thread under two conditions:
+  1. The channel is buffered and is full
+  2. The channel is unbuffered and does not have a receiver
+
+- **recvq**:  This queue holds the QueueMessage of any pending receivers of a
+channel.  When a thread performs a channel_recv operation on the channel, the
+channel_recv operation will put a new QueueMessage on the recvq and block the
+current thread under two conditions:
+  1. The channel is buffered and there is no data on the buff_
+  2. The channel is unbuffered and does not have a sender
+
+### State diagram
+
+#### Channel Send
+
+
+![Channel Send](./images/channel_send.png)
+
+
+#### Channel Receive
+
+![Channel Receive](./images/channel_recv.png)
+
+
+
+## Limitations and Considerations
+
+### Variable Copy
+
+In golang, variables in channels are copied from the sender to the receiver.
+In Paddle, the data from our variables are **moved** from sender to receiver.
+As a result, these variables should not be used after they are sent.  We
+provide a flag in channel_send method to allow users to copy the variable to
+be sent before it is sent.  
+
+Please note that this is achieved by adding an **assign** operator and creating
+a temporary variable that is sent in place of the original variable.  Please
+note that the **assign** operator has limited support for only certain variable
+datatypes.
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
index f022e67fd3a048cd7e53c91d9a1fd0506487b665..1859f983e9133674e69ecd506d7683ea926b2b8f 100644
--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -10,12 +10,42 @@ The answer relies on the fact that a `ProgramDesc` is similar to an abstract syn
 
 The following table compares concepts in Fluid and Go
 
-| Go | Fluid |
-|----|-------|
-|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) |
-| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
-| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
-| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
+
+
 
 ## An Example Concurrent Program
 
@@ -77,11 +107,11 @@ message ProgramDesc {
       read(output = X)
       kube_get_workers_addrs(output = L)
       Y = tensor_array(len(L))
-      parallel_for(input = X, output = Y, 
+      parallel_for(input = X, output = Y,
                    attrs = {L, block_id(1)}) # referring to block 1
     ]
   }
-  
+
   block[1] = Block {
     parent = 0,
     vars = [x, y, index],
@@ -102,7 +132,7 @@ func main() {  //// block 0
   X = fluid.read(...)
   L = fluid.k8s.get_worker_addrs()
   Y = fluid.tensor_array(len(L))
-  fluid.parallel_for(X, L, 
+  fluid.parallel_for(X, L,
                      func(index int) {  //// block 1
                        x = X[index]
                        fluid.send(L[index], x)
@@ -116,7 +146,7 @@ An explanation of the above program:
 
 - `fluid.k8s` is a package that provides access to Kubernetes API.  
 - `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).  
-- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, 
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
 
   1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
   2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread  
diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md
index 10d936860fab7e09241e968a63526c7d86d3e568..66d19f44baf861c7847e81ca83f61024ec877faf 100644
--- a/doc/fluid/design/concurrent/csp.md
+++ b/doc/fluid/design/concurrent/csp.md
@@ -13,14 +13,41 @@ Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously exe
 
 There were many concurrent programming models, implemented in various forms:
 
-| concurrent programming model | implementation |
-|-----|-----|
-| mutex | types and functions in standard libraries |
-| semaphore | types and functions in standard libraries |
-| communicating sequential processes (CSP) | Go programming language |
-| actor model | Erlang programming language |
-| message passing | MPI |
-| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+
+
+
+<table>
+<thead>
+<tr><th>concurrent programming model</th><th>implementation</th></tr>
+</thead>
+<tbody>
+<tr><td>mutex</td><td>types and functions in standard libraries</td></tr>
+<tr><td>semaphore</td><td>types and functions in standard libraries</td></tr>
+<tr><td>communicating sequential processes (CSP)</td><td>Go programming language</td></tr>
+<tr><td>actor model</td><td>Erlang programming language</td></tr>
+<tr><td>message passing</td><td>MPI</td></tr>
+<tr><td>bulk synchronous parallel (BSP)</td><td>Pregel distributed programming framework</td></tr>
+</tbody>
+</table>
+
+
+
 
 Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
 
@@ -118,9 +145,9 @@ There are four types of actions with a channel:
    ```go
    close(ch)
    ```
-   
+
    Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
-   
+
 There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
 
 1. A send to a nil channel blocks forever
diff --git a/doc/fluid/design/concurrent/go_op.md b/doc/fluid/design/concurrent/go_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..c18b788e80f432ebb2f14b15229e7823c112001e
--- /dev/null
+++ b/doc/fluid/design/concurrent/go_op.md
@@ -0,0 +1,231 @@
+# go_op Design
+
+## Introduction
+
+The **go_op** allows users of PaddlePaddle to run program blocks on a detached
+thread.  It works in conjunction with CSP operators (channel_send,
+channel_receive, channel_open, channel_close, and select) to allow users to
+concurrently process data and communicate easily between different threads.
+
+## How to use it
+
+```
+channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+with fluid.Go():
+    # Send a tensor of value 99 to "channel" on a detached thread
+    tensor = fill_constant(shape=[1], dtype='int', value=99)
+    tensor.stop_gradient = True
+    fluid.channel_send(channel, tensor)
+    
+# Receive sent tensor from "channel" on the main thread
+result = fill_constant(shape=[1], dtype='int', value=-1)    
+fluid.channel_recv(channel, result)
+```
+
+The go operator can be accessed by using the fluid.Go() control flow.  This
+will create a new sub block, where the user can add additional operators
+to be run on the thread.
+
+**Note:** Since back propagation is currently not supported in the go_op, users
+should ensure that operators in the go block do not require gradient
+calculations.
+
+## How it Works
+
+Similar to other control blocks, go_op will create a sub block and add it
+as a child to the current block.  Operators and variables defined in this
+block will be added to the go sub_block.
+
+In addition, the go operator will create a new child scope whose parent is
+the global scope.  Please refer to [block captures](#block-captures) for more
+information.
+
+When Paddle executor runs go_op, go_op will take the sub_block and pass it to
+the executor.run method (along with a newly created local scope) on a detached
+thread.
+
+An example of the generated program description is shown below.  Take note of
+the **go_op** in particular.  It is added as an operator in the current 
+block (in this example, block0).  The **go_op** contains a `sub_block`
+attribute, which points to the id of the block that will be executed in a 
+detached thread.
+
+```
+blocks {
+  idx: 0
+  parent_idx: -1
+  vars {
+    name: "return_value"
+    type {
+      type: LOD_TENSOR
+      lod_tensor {
+        tensor {
+          data_type: INT64
+        }
+      }
+    }
+  }
+  vars {
+    name: "status_recv"
+    type {
+      type: LOD_TENSOR
+      lod_tensor {
+        tensor {
+          data_type: BOOL
+        }
+      }
+    }
+  }
+  ...
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "channel"
+    }
+    type: "channel_create"
+    attrs {
+      name: "data_type"
+      type: INT
+      i: 7
+    }
+    attrs {
+      name: "capacity"
+      type: INT
+      i: 0
+    }
+  }
+  ops {
+    inputs {
+      parameter: "X"
+      arguments: "channel"
+    }
+    type: "go"
+    attrs {
+      name: "sub_block"
+      type: BLOCK
+      block_idx: 1
+    }
+  }
+  ops {
+    inputs {
+      parameter: "Channel"
+      arguments: "channel"
+    }
+    outputs {
+      parameter: "Out"
+      arguments: "return_value"
+    }
+    outputs {
+      parameter: "Status"
+      arguments: "status_recv"
+    }
+    type: "channel_recv"
+  }
+  ...
+}
+
+blocks {
+  idx: 1
+  parent_idx: 0
+  vars {
+    name: "status"
+    type {
+      type: LOD_TENSOR
+      lod_tensor {
+        tensor {
+          data_type: BOOL
+        }
+      }
+    }
+  }
+  ...
+  
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_1.tmp_0"
+    }
+    type: "fill_constant"
+    attrs {
+      name: "force_cpu"
+      type: BOOLEAN
+      b: false
+    }
+    attrs {
+      name: "value"
+      type: FLOAT
+      f: 99.0
+    }
+    attrs {
+      name: "shape"
+      type: INTS
+      ints: 1
+    }
+    attrs {
+      name: "dtype"
+      type: INT
+      i: 3
+    }
+  }
+  ops {
+    inputs {
+      parameter: "Channel"
+      arguments: "channel"
+    }
+    inputs {
+      parameter: "X"
+      arguments: "fill_constant_1.tmp_0"
+    }
+    outputs {
+      parameter: "Status"
+      arguments: "status"
+    }
+    type: "channel_send"
+    attrs {
+      name: "copy"
+      type: BOOLEAN
+      b: false
+    }
+  }
+```
+
+## Current Limitations
+
+#### Scopes and block captures:
+
+Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a
+block.  When a block is executed, a new local scope is created from the parent
+scope (ie: scope derived from the parent block) and associated with the new 
+child block.  After the block finishes executing, then the local scope and
+all associated variables in the scope are deleted.
+
+This works well in a single threaded scenario, however with introduction of
+go_op, a child block may continue to execute even after the parent block has
+exited.  If the go_op tries to access variables located in the parent block's
+scope, it may receive a segmentation fault because the parent scope may have
+been deleted.
+
+We need to implement block closures in order to prevent access to parent
+scope variables from causing a segmentation fault.  As a temporary workaround,
+please ensure that all variables accessed in the go block are not destructed
+before they are accessed.  Currently, the go_op will explicitly enforce
+this requirement and raise an exception if a variable could not be found in 
+the scope.
+
+Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502)
+for more details.
+
+#### Green Threads
+
+Golang utilizes `green threads`, which is a mechanism for the runtime library to
+manage multiple threads (instead of natively by the OS).  Green threads usually
+allow for faster thread creation and switching, as there is less overhead
+when spawning these threads.  For the first version of CSP, we only support
+OS threads.
+
+
+#### Backward Propagation:
+
+go_op currently does not support backwards propagation.  Please use go_op with
+non training operators.
diff --git a/doc/fluid/design/concurrent/images/channel_recv.png b/doc/fluid/design/concurrent/images/channel_recv.png
new file mode 100644
index 0000000000000000000000000000000000000000..c06cd15ae7b8a8c94d5742f6675e389081fcf789
Binary files /dev/null and b/doc/fluid/design/concurrent/images/channel_recv.png differ
diff --git a/doc/fluid/design/concurrent/images/channel_send.png b/doc/fluid/design/concurrent/images/channel_send.png
new file mode 100644
index 0000000000000000000000000000000000000000..006ebb4a5a4bcd32c97847e9fb7729a740255f7c
Binary files /dev/null and b/doc/fluid/design/concurrent/images/channel_send.png differ
diff --git a/doc/fluid/design/concurrent/index_cn.rst b/doc/fluid/design/concurrent/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e47135e9fc42760898083710e0a6767252a0225b
--- /dev/null
+++ b/doc/fluid/design/concurrent/index_cn.rst
@@ -0,0 +1,8 @@
+并发编程
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  concurrent_programming.md
+  parallel_do.md
diff --git a/doc/fluid/design/concurrent/index_en.rst b/doc/fluid/design/concurrent/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0727e75798b2a869588f80d3cce7a886554e4ffb
--- /dev/null
+++ b/doc/fluid/design/concurrent/index_en.rst
@@ -0,0 +1,8 @@
+Concurrent Programming
+-------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  concurrent_programming.md
+  parallel_do.md
diff --git a/doc/fluid/design/concurrent/select_op.md b/doc/fluid/design/concurrent/select_op.md
index 52c226bc94a4e8bfc5588705d7f65328840e91cc..4fcae57cc7932cdaebe549486e7f7cebf0bd038a 100644
--- a/doc/fluid/design/concurrent/select_op.md
+++ b/doc/fluid/design/concurrent/select_op.md
@@ -2,13 +2,13 @@
 
 ## Introduction
 
-In golang, the [**select**](https://golang.org/ref/spec#Select_statements) 
-statement lets a goroutine wait on multiple communication operations at the 
-same time. The **select** blocks until one of its cases can run, then 
-executes the case.  If multiple cases are ready to run, then one case is 
+In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
+statement lets a goroutine wait on multiple communication operations at the
+same time. The **select** blocks until one of its cases can run, then
+executes the case.  If multiple cases are ready to run, then one case is
 choosen at random to be executed.
 
-With the introduction of CSP for Paddle, we mimic this behavior by 
+With the introduction of CSP for Paddle, we mimic this behavior by
 creating a ***select_op***.
 
 ## How to use it
@@ -17,11 +17,11 @@ The **select_op** is available as a c++ operator.  However most users
 will prefer to use the much simplier Python API.
 
 - **fluid.Select()**: Creates a select operator and adds it to the current
-block within the main program.  Also creates a sub block and adds it to the 
-main program.  This sub block is used to hold all variables and operators 
+block within the main program.  Also creates a sub block and adds it to the
+main program.  This sub block is used to hold all variables and operators
 used by the case statements.
- 
-Within the select block, users can add cases by 
+
+Within the select block, users can add cases by
 calling **select.case** or **select.default** method.
 
 - **fluid.Select.case(channel_action, channel, result_variable)**: Represents
@@ -37,13 +37,13 @@ execute.
 ```
 ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
 quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-            
+
 x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
 y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
- 
+
 while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
 while_op = While(cond=while_cond)    
- 
+
 with while_op.block():
     with fluid.Select() as select:
         with select.case(fluid.channel_send, channel, x):
@@ -99,17 +99,17 @@ blocks {
     }
   }
   // Create "select" operator.
-  // inputs: 
+  // inputs:
   //   X: All input variables used by operators within the select block
   //   case_to_execute: Variable filled in by select_op when it determines
   //     which case to execute.
   //  
   // outputs:
-  //   Out: All output variables referenced by operators within select block. 
-  // 
+  //   Out: All output variables referenced by operators within select block.
+  //
   // attrs:
   //   sub_block: The block id containing the select "cases"
-  //   cases:  Serialized list of all cases in the select op. 
+  //   cases:  Serialized list of all cases in the select op.
   //     Each case is serialized as: ',,,'
   //     where type is 0 for default, 1 for send, and 2 for receive.
   //     No channel and values are needed for default cases.
@@ -150,7 +150,7 @@ into **X**.  It will also create a temp variable called **case_to_execute**.  Th
 filled in by the select_op after it has completed processing the case statements.
 
 If there are no available cases to execute (ie: all cases are blocked on channel operations, and
-there is no default statement), then the select_op will block the current thread.  The thread will 
+there is no default statement), then the select_op will block the current thread.  The thread will
 unblock once there is a channel operation affecting one of the case statements, at which point, the
 **select_op** will set the **case_to_execute** variable to the index of the case to execute.
 
@@ -247,17 +247,17 @@ blocks {
 
 ```
 
-Cases are represented by a **conditional_block operator**, whose's condition is set as the output of 
-equal(**case_to_execute**, **case_index**).  Since each case index is unique in this sub-block, 
+Cases are represented by a **conditional_block operator**, whose condition is set as the output of
+equal(**case_to_execute**, **case_index**).  Since each case index is unique in this sub-block,
 only one case will be executed.
 
 ### select_op flow
 
 
-
+
 
 
-The select algorithm is inspired by golang's select routine.  Please refer to 
+The select algorithm is inspired by golang's select routine.  Please refer to
 http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information.
 
 ## Backward Pass
diff --git a/doc/fluid/design/data_type/index_cn.rst b/doc/fluid/design/data_type/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b60167b6b1599df69dfc5073ebf32bdbb0a316ec
--- /dev/null
+++ b/doc/fluid/design/data_type/index_cn.rst
@@ -0,0 +1,7 @@
+数据类型
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  float16.md
diff --git a/doc/fluid/design/data_type/index_en.rst b/doc/fluid/design/data_type/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6a88d17943f49134a2d00363845e919537ff4545
--- /dev/null
+++ b/doc/fluid/design/data_type/index_en.rst
@@ -0,0 +1,7 @@
+Data Type
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  float16.md
diff --git a/doc/fluid/design/dist_train/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md
index a405cb6aaf80b9d2e8a1a9c774ca85cc7e62bbab..229cb47c17d633be6848bb35e58d33ec9b47ec3b 100644
--- a/doc/fluid/design/dist_train/distributed_architecture.md
+++ b/doc/fluid/design/dist_train/distributed_architecture.md
@@ -40,11 +40,11 @@ computation is only specified in Python code which sits outside of PaddlePaddle,
 
 Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
 
- +
+ PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
 
-
 
 PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
 
- +
+ The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
 
@@ -60,7 +60,7 @@ For a detailed explanation, refer to this document -
 
 The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
 
-
 
 The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
 
@@ -60,7 +60,7 @@ For a detailed explanation, refer to this document -
 
 The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
 
- +
+ The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
 
@@ -152,7 +152,7 @@ for data in train_reader():
 `JobDesc` object describe the distributed job resource specification to run on
 Cluster environment.
 
-
 
 The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
 
@@ -152,7 +152,7 @@ for data in train_reader():
 `JobDesc` object describe the distributed job resource specification to run on
 Cluster environment.
 
- +
+ `RemoteExecutor.run` sends the `ProgramDesc` and
 [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
@@ -171,7 +171,7 @@ In the future, a more general placement algorithm should be implemented, which m
 
 The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
 
-
 
 `RemoteExecutor.run` sends the `ProgramDesc` and
 [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
@@ -171,7 +171,7 @@ In the future, a more general placement algorithm should be implemented, which m
 
 The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
 
- +
+ ### Training Data
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
index e543adf0f97cc6b47415b807d7a1ed1effec9b22..988729138926f035750b59eb245dde82502a3ad2 100644
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@@ -1,4 +1,4 @@
-## Design Doc: Distributed Lookup Table Operator
+# Design Doc: Distributed Lookup Table Operator
 
 A lookup table operator in PaddlePaddle where the table could be out
 of the memory of a computer.
diff --git a/doc/fluid/design/dist_train/index_cn.rst b/doc/fluid/design/dist_train/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed6f3dda271d2de58d92aa7ec804fa9e68dfc48a
--- /dev/null
+++ b/doc/fluid/design/dist_train/index_cn.rst
@@ -0,0 +1,9 @@
+分布式训练
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  distributed_architecture.md
+  distributed_lookup_table_design.md
+  parameter_server.md
diff --git a/doc/fluid/design/dist_train/index_en.rst b/doc/fluid/design/dist_train/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f84688f168021113bd933802709bcd787b474bca
--- /dev/null
+++ b/doc/fluid/design/dist_train/index_en.rst
@@ -0,0 +1,9 @@
+Distributed Training
+---------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  distributed_architecture.md
+  distributed_lookup_table_design.md
+  parameter_server.md
diff --git a/doc/fluid/design/dist_train/multi_cpu.md b/doc/fluid/design/dist_train/multi_cpu.md
index a8d8ee0422acc84835170a44eb83f9b5f0c6bb40..38222d083084ebfca3099ce96b47868c42d55101 100644
--- a/doc/fluid/design/dist_train/multi_cpu.md
+++ b/doc/fluid/design/dist_train/multi_cpu.md
@@ -8,11 +8,11 @@ Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph.
 
 ## Transpiler
 
-
 
 
 ### Training Data
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
index e543adf0f97cc6b47415b807d7a1ed1effec9b22..988729138926f035750b59eb245dde82502a3ad2 100644
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@@ -1,4 +1,4 @@
-## Design Doc: Distributed Lookup Table Operator
+# Design Doc: Distributed Lookup Table Operator
 
 A lookup table operator in PaddlePaddle where the table could be out
 of the memory of a computer.
diff --git a/doc/fluid/design/dist_train/index_cn.rst b/doc/fluid/design/dist_train/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed6f3dda271d2de58d92aa7ec804fa9e68dfc48a
--- /dev/null
+++ b/doc/fluid/design/dist_train/index_cn.rst
@@ -0,0 +1,9 @@
+分布式训练
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  distributed_architecture.md
+  distributed_lookup_table_design.md
+  parameter_server.md
diff --git a/doc/fluid/design/dist_train/index_en.rst b/doc/fluid/design/dist_train/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f84688f168021113bd933802709bcd787b474bca
--- /dev/null
+++ b/doc/fluid/design/dist_train/index_en.rst
@@ -0,0 +1,9 @@
+Distributed Training
+---------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  distributed_architecture.md
+  distributed_lookup_table_design.md
+  parameter_server.md
diff --git a/doc/fluid/design/dist_train/multi_cpu.md b/doc/fluid/design/dist_train/multi_cpu.md
index a8d8ee0422acc84835170a44eb83f9b5f0c6bb40..38222d083084ebfca3099ce96b47868c42d55101 100644
--- a/doc/fluid/design/dist_train/multi_cpu.md
+++ b/doc/fluid/design/dist_train/multi_cpu.md
@@ -8,11 +8,11 @@ Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph.
 
 ## Transpiler
 
- +
+ After converted:
 
-
 
 After converted:
 
- +
+ ## Implement
 
diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md
index 6ce48dfbfce8b094684b412ebfda7e505ddc30ae..73c85da5e89eee0ac7857a0b808bc64ae673fdad 100644
--- a/doc/fluid/design/dist_train/parameter_server.md
+++ b/doc/fluid/design/dist_train/parameter_server.md
@@ -41,11 +41,11 @@ We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
 Below is an example of converting the user defined graph to the
 subgraphs for the trainer and the parameter server:
 
-
 
 ## Implement
 
diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md
index 6ce48dfbfce8b094684b412ebfda7e505ddc30ae..73c85da5e89eee0ac7857a0b808bc64ae673fdad 100644
--- a/doc/fluid/design/dist_train/parameter_server.md
+++ b/doc/fluid/design/dist_train/parameter_server.md
@@ -41,11 +41,11 @@ We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
 Below is an example of converting the user defined graph to the
 subgraphs for the trainer and the parameter server:
 
- +
+ After converting:
 
-
 
 After converting:
 
- +
+ 1. The parameter variable W and its optimizer program are placed on the parameter server.
 1. Operators are added to the program.
@@ -69,8 +69,7 @@ In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list o
 non-zero gradient data. So when we do parameter optimization both locally and remotely,
 we only need to send those non-zero rows to the optimizer operators:
 
-
 
 1. The parameter variable W and its optimizer program are placed on the parameter server.
 1. Operators are added to the program.
@@ -69,8 +69,7 @@ In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list o
 non-zero gradient data. So when we do parameter optimization both locally and remotely,
 we only need to send those non-zero rows to the optimizer operators:
 
- -
+
-
+ ### Benefits
 
 - Model parallelism becomes easier to implement: it is an extension to
diff --git a/doc/fluid/design/dynamic_rnn/index_cn.rst b/doc/fluid/design/dynamic_rnn/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1d224d22cf7103616f44115db01f0ae55f1cb88a
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/index_cn.rst
@@ -0,0 +1,8 @@
+动态RNN
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  rnn.md
+  rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/index_en.rst b/doc/fluid/design/dynamic_rnn/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..568f496e4ffe21a5e730488aef905f7e2d98839e
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/index_en.rst
@@ -0,0 +1,8 @@
+Dynamic RNN
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  rnn.md
+  rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md
index 2f4854793fa1f0b02e4dc17b51a48a972be61c06..7b61b050f640814d6949cf6847b431da53d59581 100644
--- a/doc/fluid/design/dynamic_rnn/rnn.md
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i
 ## RNN Algorithm Implementation
 ### Benefits
 
 - Model parallelism becomes easier to implement: it is an extension to
diff --git a/doc/fluid/design/dynamic_rnn/index_cn.rst b/doc/fluid/design/dynamic_rnn/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1d224d22cf7103616f44115db01f0ae55f1cb88a
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/index_cn.rst
@@ -0,0 +1,8 @@
+动态RNN
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  rnn.md
+  rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/index_en.rst b/doc/fluid/design/dynamic_rnn/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..568f496e4ffe21a5e730488aef905f7e2d98839e
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/index_en.rst
@@ -0,0 +1,8 @@
+Dynamic RNN
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  rnn.md
+  rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md
index 2f4854793fa1f0b02e4dc17b51a48a972be61c06..7b61b050f640814d6949cf6847b431da53d59581 100644
--- a/doc/fluid/design/dynamic_rnn/rnn.md
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i
 ## RNN Algorithm Implementation
 
 
- +
+ 
 
 
 The above diagram shows an RNN unrolled into a full network.
@@ -22,7 +22,7 @@ There are several important concepts here:
 There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
 
 
-
+
 Figure 2 illustrates the RNN's data flow
 
 
@@ -49,7 +49,7 @@ or copy the memory value of the previous step to the current ex-memory variable.
 
 ### Usage in Python
 
-For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md).
 
 We can define an RNN's step-net using a Block:
 
@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par
 The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
 
 
- +
+ 
 
 
 ```python
@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st
 
 
 
- +
+ 
 
diff --git a/doc/fluid/design/dynamic_rnn/rnn_design.md b/doc/fluid/design/dynamic_rnn/rnn_design.md
index 3d38b9a0ad225fd8e0c1bb037474b292b1887f5b..cecfcd3307ae4c4fa603220a360e9e124069fa58 100644
--- a/doc/fluid/design/dynamic_rnn/rnn_design.md
+++ b/doc/fluid/design/dynamic_rnn/rnn_design.md
@@ -99,7 +99,7 @@ private:
     - 由于传递过程是以复制`shared_ptr`的方式实现,因此框架只需要传递一次 `lod_start_pos`
 
 2. 对于不感知 `lod_start_pos` 的Op足够透明
-3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据 
+3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据
 
 具体的设计分为以下3小节
 
@@ -189,7 +189,7 @@ struct SortedSeqItem {
 
 std::vector sorted_seqs;
 ```
-来追踪序列排序后的位置,并添加一个新的接口 
+来追踪序列排序后的位置,并添加一个新的接口
 
 ```c++
 std::vector SortBySeqLen(const LODTensor& tensor);
@@ -233,7 +233,10 @@ x    x
 - 将每个序列concat 为规则的mini-batch表示
 
 ## 参考文献
-1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
-2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
-3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
-4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
+[Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
+
+[mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
+
+[variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
+
+[Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/doc/fluid/design/execution/index_cn.rst b/doc/fluid/design/execution/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed31b017429d168b2466d8f6b423f48bd5d78d1f
--- /dev/null
+++ b/doc/fluid/design/execution/index_cn.rst
@@ -0,0 +1,8 @@
+执行流程
+-------------
+
+.. toctree::
+  :maxdepth: 1
+
+  switch.md
+  if_else_op.md
diff --git a/doc/fluid/design/execution/index_en.rst b/doc/fluid/design/execution/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fcf846da348ff0bed707c42718e08314998fbac0
--- /dev/null
+++ b/doc/fluid/design/execution/index_en.rst
@@ -0,0 +1,8 @@
+Execution Process
+--------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  switch.md
+  if_else_op.md
diff --git a/doc/fluid/design/execution/switch.md b/doc/fluid/design/execution/switch.md
index 827d0601c621e4a230de28e2baad8e196e69625e..1c337bd7159b25e594c2f91f9a143b3f4bc3c8e8 100644
--- a/doc/fluid/design/execution/switch.md
+++ b/doc/fluid/design/execution/switch.md
@@ -1,6 +1,6 @@
-### Design Doc: Switch
+# Design Doc: Switch
 
-### Background
+## Background
 
 Many programming languages provide `switch` as a generalization of `if-elif-else`.  We want to add it to Fluid.
 
@@ -19,7 +19,7 @@ with switch() as switch:
         fluid.print("Case 3")
 ```
 
-### The Semantics
+## The Semantics
 
 1. A `switch` control-flow checks cases one-by-one.
 1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values.
diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e9f55214f411abb11bef180d7af4716ad85a0b09
--- /dev/null
+++ b/doc/fluid/design/index_cn.rst
@@ -0,0 +1,19 @@
+设计思想
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  motivation/index_cn.rst
+  execution/index_cn.rst
+  concepts/index_cn.rst
+  data_type/index_cn.rst
+  memory/index_cn.rst
+  muti_devices/index_cn.rst
+  dynamic_rnn/index_cn.rst
+  concurrent/index_cn.rst
+  algorithm/index_cn.rst
+  network/index_cn.rst
+  modules/index_cn.rst
+  interface/index_cn.rst
+  dist_train/index_cn.rst
diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2802dc3a31d540c5a19bf9042053496aad152f98
--- /dev/null
+++ b/doc/fluid/design/index_en.rst
@@ -0,0 +1,19 @@
+Design
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  motivation/index_en.rst
+  execution/index_en.rst
+  concepts/index_en.rst
+  data_type/index_en.rst
+  memory/index_en.rst
+  muti_devices/index_en.rst
+  dynamic_rnn/index_en.rst
+  concurrent/index_en.rst
+  algorithm/index_en.rst
+  network/index_en.rst
+  modules/index_en.rst
+  interface/index_en.rst
+  dist_train/index_en.rst
diff --git a/doc/fluid/design/interface/index_cn.rst b/doc/fluid/design/interface/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..69a8d9bad4fe88935b9fa87757abf0105ca8eb75
--- /dev/null
+++ b/doc/fluid/design/interface/index_cn.rst
@@ -0,0 +1,4 @@
+多语言接口
+------------
+
+TBD
diff --git a/doc/fluid/design/interface/index_en.rst b/doc/fluid/design/interface/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..22abc71f984aa5da7151d5ebf0c3bdbcc69a3624
--- /dev/null
+++ b/doc/fluid/design/interface/index_en.rst
@@ -0,0 +1,4 @@
+Multi-Language Interface
+------------------------
+
+TBD
diff --git a/doc/fluid/design/memory/index_cn.rst b/doc/fluid/design/memory/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c507c638bd1a6eb428175ed2756a6ecfc6cca198
--- /dev/null
+++ b/doc/fluid/design/memory/index_cn.rst
@@ -0,0 +1,7 @@
+内存管理
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  memory_optimization.md
diff --git a/doc/fluid/design/memory/index_en.rst b/doc/fluid/design/memory/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f7526437a73a09b300f05e138084755f5528b242
--- /dev/null
+++ b/doc/fluid/design/memory/index_en.rst
@@ -0,0 +1,7 @@
+Memory Management
+-------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  memory_optimization.md
diff --git a/doc/fluid/design/modules/batch_norm_op.md b/doc/fluid/design/modules/batch_norm_op.md
index d1392619c42d9206bf4bddcd33ad11b033e6cbdb..e451ffcc73b5de2b911e1c6de54b42a5d1d54c37 100644
--- a/doc/fluid/design/modules/batch_norm_op.md
+++ b/doc/fluid/design/modules/batch_norm_op.md
@@ -2,7 +2,7 @@
 
 ## What is batch normalization
 
-Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and make the data distribution easier for next layer's training. 
+Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and makes the data distribution easier for the next layer's training.
 
 The principle of batch normalization can be summarized into a simple function:
 
@@ -66,7 +66,7 @@ As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attribu
 
 The following graph showes the training computational process of `batch_norm_op`:
 
- +
+ cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel.
 
@@ -74,13 +74,13 @@ cudnn provides APIs to finish the whole series of computation, we can use them i
 
 `batch_norm_op` is warpped as a layer in Python:
 
-```python 
-def batch_norm_layer(net, 
+```python
+def batch_norm_layer(net,
                      input,
-                     output, 
-                     scale, 
-                     bias, 
-                     use_global_est = False, 
+                     output,
+                     scale,
+                     bias,
+                     use_global_est = False,
                      epsilon = 1e-6,
                      momentum = 0.99):
 	mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
@@ -119,15 +119,15 @@ for pass_id in range(PASS_NUM):
     if pass_id % 100 == 0:
         net.infer(test_image)    # run inferencing model
     # ...
-``` 
+```
 
 `is_infer` is an attribute. Once an operator is created, its attributes can not be changed. It suggests us that we shall maintain two `batch_norm_op` in the model, one's `is_infer` is `True`(we call it `infer_batch_norm_op`) and the other one's is `False`(we call it `train_batch_norm_op`). They share all parameters and variables, but be placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one go through `train_batch_norm_op` and the other one go through `infer_batch_norm_op`:
 
 cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel.
 
@@ -74,13 +74,13 @@ cudnn provides APIs to finish the whole series of computation, we can use them i
 
 `batch_norm_op` is warpped as a layer in Python:
 
-```python 
-def batch_norm_layer(net, 
+```python
+def batch_norm_layer(net,
                      input,
-                     output, 
-                     scale, 
-                     bias, 
-                     use_global_est = False, 
+                     output,
+                     scale,
+                     bias,
+                     use_global_est = False,
                      epsilon = 1e-6,
                      momentum = 0.99):
 	mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
@@ -119,15 +119,15 @@ for pass_id in range(PASS_NUM):
     if pass_id % 100 == 0:
         net.infer(test_image)    # run inferencing model
     # ...
-``` 
+```
 
 `is_infer` is an attribute. Once an operator is created, its attributes can not be changed. It suggests us that we shall maintain two `batch_norm_op` in the model, one's `is_infer` is `True`(we call it `infer_batch_norm_op`) and the other one's is `False`(we call it `train_batch_norm_op`). They share all parameters and variables, but be placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one go through `train_batch_norm_op` and the other one go through `infer_batch_norm_op`:
 
 
-

+
 
 
+
+
+| Python classes+ | Protobuf messages+ | 
+
+
+
+| Program+ | ProgramDesc+ | 
+
+| Block+ | BlockDesc+ | 
+
+| Operator+ | OpDesc+ | 
+
+| Variable+ | VarDesc+ | 
+
+
+
 
 Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
 
diff --git a/doc/fluid/design/modules/regularization.md b/doc/fluid/design/modules/regularization.md
index 21280ac898feb4dd5e5a5d9e88d121e856850f0b..8cd5ff71d193f03e1ac923724b52f28c6057d25d 100644
--- a/doc/fluid/design/modules/regularization.md
+++ b/doc/fluid/design/modules/regularization.md
@@ -6,23 +6,23 @@ A central problem in machine learning is how to design an algorithm that will pe
 ### Parameter Norm Penalties
 Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
 
-
+
 
 The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
 
 The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
 
 ##### L2 Regularization:
-
+
 
 ##### L1 Regularization
-
+
 
 A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
 
 ## Regularization Survey
 
-A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey). 
+A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey).
 
 ## Proposal for Regularization in PaddlePaddle
 
@@ -32,41 +32,35 @@ In the new design, we propose to create new operations for regularization. For n
 - L2_regularization_op
 - L1_regularization_op
 
-These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties. 
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
 
-The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. 
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
 
 ### Computation Graph
 
 Below is an example of a really simple feed forward neural network.
 
-
+
 
 The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
 
-
+
    
 ### Python API implementation for Regularization
 
-Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. 
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
 
 #### Creation of Regularization ops
 There are two possibilities for creating the regularization ops:
-1. We create these ops immediately while building the computation graph. 
-2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added. 
+1. We create these ops immediately while building the computation graph.
+2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added.
 
-The proposal is to add these ops in a lazy manner just before the backward pass. 
+The proposal is to add these ops in a lazy manner just before the backward pass.
 
 #### Storage of Regularization attributes
 
-Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters. 
+Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters.
 
 #### High-level API
 
 In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
-
-
-
-
-
-    
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
index 110b7d78bf12ac8328fb3a913e4386e75d63c995..5e147f8263e685a4665b5793f7127178cbc3cfdd 100644
--- a/doc/fluid/design/motivation/fluid.md
+++ b/doc/fluid/design/motivation/fluid.md
@@ -10,11 +10,37 @@ Fluid is the answer.  Fluid is similar to PyTorch and TensorFlow Eager Execution
 
 Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
 
-| Existed since | model as sequence of layers | model as graph of operators | No model |
-|--|--|--|--|
-| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
-| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
-| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
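+<table>
+<thead>
+<tr>
+<th>Existed since</th>
+<th>model as sequence of layers</th>
+<th>model as graph of operators</th>
+<th>No model</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>2013</td>
+<td>Caffe, Theano, Torch, PaddlePaddle</td>
+<td> </td>
+<td> </td>
+</tr>
+<tr>
+<td>2015</td>
+<td> </td>
+<td>TensorFlow, MxNet, Caffe2, ONNX, n-graph</td>
+<td> </td>
+</tr>
+<tr>
+<td>2016</td>
+<td> </td>
+<td> </td>
+<td>PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid</td>
+</tr>
+</tbody>
+</table>
+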
 
 From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model.  To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
 
diff --git a/doc/fluid/design/motivation/index_cn.rst b/doc/fluid/design/motivation/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7706e73eca644ed6db772fd77da947395313237f
--- /dev/null
+++ b/doc/fluid/design/motivation/index_cn.rst
@@ -0,0 +1,10 @@
+设计动机和目标
+--------------
+
+.. toctree::
+  :maxdepth: 1
+
+  api.md
+  refactorization.md
+  fluid.md
+  fluid_compiler.md
diff --git a/doc/fluid/design/motivation/index_en.rst b/doc/fluid/design/motivation/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..10b64b257c604ced6b957d6d6018e8a363f00fac
--- /dev/null
+++ b/doc/fluid/design/motivation/index_en.rst
@@ -0,0 +1,10 @@
+Design Motivations and Goals
+--------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  api.md
+  refactorization.md
+  fluid.md
+  fluid_compiler.md
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
index f93d6155e1764386b01d2f0df3f141ab75cd55d4..f199cc892f5e84f0a12abe3b8e5cace9849e7fa8 100644
--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -36,11 +36,37 @@ At compile time, the Python program generates a protobuf message representation
 
 At runtime, the C++ program realizes the graph and runs it.
 
-| | Representation (protobuf messages) | Realization (C++ class objects) |
-|---|---|---|
-|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
-|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
-|Block|BlockDesc|Block|
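+<table>
+<thead>
+<tr>
+<th></th>
+<th>Representation (protobuf messages)</th>
+<th>Realization (C++ class objects)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107">VarDesc</a></td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24">Variable</a></td>
+</tr>
+<tr>
+<td>Operation</td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35">OpDesc</a></td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64">Operator</a></td>
+</tr>
+<tr>
+<td>Block</td>
+<td>BlockDesc</td>
+<td>Block</td>
+
+</tr>
+</tbody>
+</table>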
 
 The word *graph* is interchangeable with *block* in this document.  A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`).
 
@@ -97,13 +123,13 @@ Compile Time -> IR -> Runtime
 
 ---
 
-# Operator/OpWithKernel/OpKernel
+## Operator/OpWithKernel/OpKernel
 
 
 
 ---
 
-# Operator
+## Operator
 
 
 * `Operator` is the fundamental building block of the user interface.
@@ -113,7 +139,7 @@ Compile Time -> IR -> Runtime
 
 ---
 
-# OpWithKernel/Kernel
+## OpWithKernel/Kernel
 
 
 
@@ -124,7 +150,7 @@ Compile Time -> IR -> Runtime
 
 ---
 
-# Why separate Kernel and Operator
+## Why separate Kernel and Operator
 
 * Separate GPU and CPU code.
     * Make Paddle capable of running without GPU.
@@ -132,7 +158,7 @@ Compile Time -> IR -> Runtime
     * For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel.
 ---
 
-# Libraries for Kernel development
+## Libraries for Kernel development
 
 * `Eigen::Tensor` contains basic math and element-wise functions.
     * Note that `Eigen::Tensor` has broadcast implementation.
@@ -143,16 +169,16 @@ Compile Time -> IR -> Runtime
 * Hand-writing `GPUKernel` and `CPU` code
     * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
 ---
-# Operator Registration
+## Operator Registration
 
-## Why is registration necessary?
+### Why is registration necessary?
 We need a method to build mappings between Op type names and Op classes.
 
-## How is registration implemented?
+### How is registration implemented?
 Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.
 
 ---
-# The Registry Map
+## The Registry Map
 
 ### `OpInfoMap`
 
@@ -166,7 +192,7 @@ Maintaining a map, whose key is the type name and the value is the corresponding
 - **`checker`**: Used to check attributes.
 
 ---
-# Related Concepts
+## Related Concepts
 
 ### Op_Maker
 It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
@@ -178,7 +204,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 ```
 
 ---
-# Registration Process
+## Registration Process
 1. Write an Op class and its gradient Op class, if required.
 2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
 3. Invoke the macro `REGISTER_OP`. This macro will
@@ -186,13 +212,13 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 	2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
 
 ---
-# Backward Module (1/2)
+## Backward Module (1/2)
 ### Create Backward Operator
 - Mapping from forward Op to backward Op
 
 
 ---
-# Backward Module (2/2)
+## Backward Module (2/2)
 ### Build Backward Network
 - **Input**: a graph of forward operators
 - **Output**: a graph of backward operators
@@ -205,7 +231,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 
 
 ---
-# Scope, Variable, Tensor
+## Scope, Variable, Tensor
 
 * `Tensor` is an n-dimension array with type.
 	* Only dims and data pointers are stored in `Tensor`.
@@ -218,8 +244,8 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 	* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
 
 ---
-# Block (in design)
-## the difference between original RNNOp and Block
+## Block (in design)
+### the difference between original RNNOp and Block
 - As an operator is more intuitive than `RNNOp`,
 - Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
 - Fits the compile-time/ runtime separation design paradigm.
@@ -227,7 +253,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
   - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
 
 ---
-# Milestone
+## Milestone
 - Take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
 - Model migration
   - Framework development gives **priority support** to model migration, for example,
@@ -240,7 +266,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 - Accept imperfection, concentrate on solving the specific problem at the right price.
 
 ---
-# Control the migration quality
+## Control the migration quality
 - Compare the performance of migrated models with old ones.
 - Follow the google C++ style guide.
 - Build the automatic workflow of generating Python/C++ documentations.
diff --git a/doc/fluid/design/muti_devices/index_cn.rst b/doc/fluid/design/muti_devices/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1f8439e8623e1c1ae9a12c24d08079f0ec3d761f
--- /dev/null
+++ b/doc/fluid/design/muti_devices/index_cn.rst
@@ -0,0 +1,9 @@
+多设备支持
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  operator_kernel_type.md
+  kernel_selection.md
+  kernel_hint_design.md
diff --git a/doc/fluid/design/muti_devices/index_en.rst b/doc/fluid/design/muti_devices/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..819e9c5d77b2abf8da0e2ce6f494ea5174c1d0a2
--- /dev/null
+++ b/doc/fluid/design/muti_devices/index_en.rst
@@ -0,0 +1,9 @@
+Multi-Device Support
+----------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  operator_kernel_type.md
+  kernel_selection.md
+  kernel_hint_design.md
diff --git a/doc/fluid/design/muti_devices/kernel_hint_design.md b/doc/fluid/design/muti_devices/kernel_hint_design.md
index a54b7da045e1a362626ef066f9ebb56af2c3181a..58e44b64169d8c942174de86986403570b271641 100644
--- a/doc/fluid/design/muti_devices/kernel_hint_design.md
+++ b/doc/fluid/design/muti_devices/kernel_hint_design.md
@@ -1,3 +1,5 @@
+# Kernel Hint Design
+
 ## Problem
 In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
 
diff --git a/doc/fluid/design/muti_devices/kernel_selection.md b/doc/fluid/design/muti_devices/kernel_selection.md
index 9719e031c70979cd95400701efd30879662e19bc..967317d5d2eeb818ab14faabca342cc8c4ed717e 100644
--- a/doc/fluid/design/muti_devices/kernel_selection.md
+++ b/doc/fluid/design/muti_devices/kernel_selection.md
@@ -1,3 +1,5 @@
+# Kernel Selection
+
 ## Background
 Every operator has many kernels because there are multiple data types, places, data layout, library type that Fluid supports. We use the `OpKernelType ` to describe kernel types that operators can hold.
 
diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md
index af0c6ef36feba9e0239e7a5f81a8dc9108b2471a..f32a5b7e8a4d820319a666dab4c3129360e2c924 100644
--- a/doc/fluid/design/network/deep_speech_2.md
+++ b/doc/fluid/design/network/deep_speech_2.md
@@ -1,4 +1,4 @@
-# DeepSpeech2 on PaddlePaddle: Design Doc 
+# DeepSpeech2 on PaddlePaddle: Design Doc
 
 We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine,  on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
 
@@ -68,11 +68,33 @@ We roughly break down the project into 14 tasks:
 
 Tasks parallelizable within phases:
 
-Roadmap     | Description                               | Parallelizable Tasks 
------------ | :------------------------------------     | :--------------------
-Phase I	    | Simplified model & components             | *Task 1* ~ *Task 8*
-Phase II    | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
-Phase III   | Documentations                            | *Task13* ~ *Task14*
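+<table>
+<thead>
+<tr>
+<th>Roadmap</th>
+<th>Description</th>
+<th>Parallelizable Tasks</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Phase I</td>
+<td>Simplified model &amp; components</td>
+<td>Task 1 ~ Task 8</td>
+</tr>
+<tr>
+<td>Phase II</td>
+<td>Standard model &amp; benchmarking &amp; profiling</td>
+<td>Task 9 ~ Task 12</td>
+</tr>
+<tr>
+<td>Phase III</td>
+<td>Documentations</td>
+<td>Task13 ~ Task14</td>
+</tr>
+</tbody>
+</table>
+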
 
 Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
 
@@ -94,7 +116,7 @@ The classical DS2 network contains 15 layers (from bottom to top):
 - **One** CTC-loss layer
 
 
-

+

 Figure 1. Archetecture of Deep Speech 2 Network.
 
+<table>
+<thead>
+<tr>
+<th>Required Components</th>
+<th>PaddlePaddle Support</th>
+<th>Need to Develop</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data Layer I (Spectrogram)</td>
+<td>Not supported yet.</td>
+<td>TBD (Task 3)</td>
+</tr>
+<tr>
+<td>Data Layer II (Transcription)</td>
+<td>paddle.data_type.integer_value_sequence</td>
+<td>-</td>
+</tr>
+<tr>
+<td>2D Convolution Layer</td>
+<td>paddle.layer.image_conv_layer</td>
+<td>-</td>
+</tr>
+<tr>
+<td>DataType Converter (vec2seq)</td>
+<td>paddle.layer.block_expand</td>
+<td>-</td>
+</tr>
+<tr>
+<td>Bi-/Uni-directional RNNs</td>
+<td>paddle.layer.recurrent_group</td>
+<td>-</td>
+</tr>
+<tr>
+<td>Row Convolution Layer</td>
+<td>Not supported yet.</td>
+<td>TBD (Task 4)</td>
+</tr>
+<tr>
+<td>CTC-loss Layer</td>
+<td>paddle.layer.warp_ctc</td>
+<td>-</td>
+</tr>
+<tr>
+<td>Batch Normalization Layer</td>
+<td>paddle.layer.batch_norm</td>
+<td>-</td>
+</tr>
+<tr>
+<td>CTC-Beam search</td>
+<td>Not supported yet.</td>
+<td>TBD (Task 6)</td>
+</tr>
+</tbody>
+</table>
 
 ### Row Convolution
 
@@ -141,18 +208,18 @@ TODO by Assignees
 ### Beam Search with CTC and LM
 
 
-

+

 Figure 2. Algorithm for CTC Beam Search Decoder.
 
-   +
+   
 
 
 According to the image above, the only phase that changes the LoD is beam search.
diff --git a/doc/fluid/design/others/gan_api.md b/doc/fluid/design/others/gan_api.md
index fb41df8615f73d9fd4c32995eab265833eac1a55..7167470088766985fa5ad31657410309330fd725 100644
--- a/doc/fluid/design/others/gan_api.md
+++ b/doc/fluid/design/others/gan_api.md
@@ -1,24 +1,24 @@
 # Design for GAN
 
-GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas. 
+GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas.
 
 It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth.
 
 In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
 
 
-
+
 Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
 
 
 The operators, layers and functions required/optional to build a GAN demo is summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
 
 
-
+
 Figure 2. Photo borrowed from the original DC-GAN paper.
 
 
-## The Conditional-GAN might be a class. 
+## The Conditional-GAN might be a class.
 This design we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains following data structure:
 
 - DCGAN(object): which contains everything required to build a GAN model. It provides following member functions methods as API:
@@ -29,7 +29,7 @@ This design we adopt the popular open source design in https://github.com/carped
 Returns a generated image.
 
 - discriminator(image):
-Given an image, decide if it is from a real source or a fake one. 
+Given an image, decide if it is from a real source or a fake one.
 Returns a 0/1 binary label.
 
 - build_model(self):
@@ -47,7 +47,7 @@ To be more detailed, we introduce our design of DCGAN as following:
 ```python
 class DCGAN(object):
   def __init__(self, y_dim=None):
-  
+
     # hyper parameters  
     self.y_dim = y_dim # conditional gan or not
     self.batch_size = 100
@@ -82,18 +82,18 @@ class DCGAN(object):
     # input z: the random noise
     # input y: input data label (optional)
     # output G_im: generated fake images
-    
+
     if not self.y_dim:
       z = pd.layer.concat(1, [z, y])
-      
+
     G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0)
     G_h0_bn = pd.layer.batch_norm(G_h0)
     G_h0_relu = pd.layer.relu(G_h0_bn)
-    
+
     G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1)
     G_h1_bn = pd.layer.batch_norm(G_h1)
     G_h1_relu = pd.layer.relu(G_h1_bn)
-    
+
     G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2))
     G_im = pd.layer.tanh(G_im)
     return G_im
@@ -111,11 +111,11 @@ class DCGAN(object):
     D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0)
     D_h0_bn = pd.layer.batchnorm(h0)
     D_h0_relu = pd.layer.lrelu(h0_bn)
-    
+
     D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1)
     D_h1_bn = pd.layer.batchnorm(D_h1)
     D_h1_relu = pd.layer.lrelu(D_h1_bn)
-    
+
     D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2)
     return D_h2
 ```
@@ -123,7 +123,7 @@ class DCGAN(object):
 ### Class member function: Build the model
 - Define data readers as placeholders to hold the data;
 - Build generator and discriminators;
-- Define two training losses for discriminator and generator, respectively. 
+- Define two training losses for discriminator and generator, respectively.
 If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this:
 ```python
 class DCGAN(object):
@@ -133,7 +133,7 @@ class DCGAN(object):
     self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
     self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
     self.z = pd.data(tf.float32, [None, self.z_size])
-    
+
     # step 1: generate images by generator, classify real/fake images with discriminator
     if self.y_dim: # if conditional GAN, includes label
         self.G = self.generator(self.z, self.y)
@@ -147,12 +147,12 @@ class DCGAN(object):
         # generate fake images
         self.sampled = self.sampler(self.z)
         self.D_f = self.discriminator(self.images)
-    
+
     # step 2: define the two losses
     self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
     self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
     self.d_loss = self.d_loss_real + self.d_loss_fake
-    
+
     self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie))
 ```
 
@@ -176,7 +176,7 @@ class DCGAN(object):
         self.G = self.generator(self.z)
         self.D_g = self.discriminator(self.G, self.y)
       self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie))
-    
+
     with pd.default_block().d_block():
       if self.y_dim: # if conditional GAN, includes label
         self.D_t = self.discriminator(self.images, self.y)
@@ -217,7 +217,7 @@ if __name__ == "__main__":
 
     # load mnist data
     data_X, data_y = self.load_mnist()
-    
+
     # Two subgraphs required!!!
     with pd.block().d_block():
       d_optim = pd.train.Adam(lr = .001, beta= .1)
@@ -228,7 +228,7 @@ if __name__ == "__main__":
 
     # executor
     sess = pd.executor()
-    
+
     # training
     for epoch in xrange(10000):
       for batch_id in range(N / batch_size):
@@ -239,7 +239,7 @@ if __name__ == "__main__":
         batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
 
         if batch_id % 2 == 0:
-          sess.run(d_step, 
+          sess.run(d_step,
                    feed_dict = {dcgan.images: batch_im,
                                 dcgan.y: batch_label,
                                 dcgan.z: batch_z})
diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md
index 5596b2653ae6ed9917f77dad08f926bcb1fb3419..b50f18f21df0787b9761bf0935ed7f4384ff0f98 100644
--- a/doc/fluid/dev/api_doc_std_cn.md
+++ b/doc/fluid/dev/api_doc_std_cn.md
@@ -45,11 +45,11 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
 - Python API Definition
 
   - 格式:
-    
+
       [Python API Definition]
-    
+
   - 示例
-  
+
       ```
       fc(input,
          size,
@@ -63,19 +63,19 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
       ```
 
 - Function Description
-  
+
   - 格式
 
       本模块应包含以下内容(排列顺序为文档撰写顺序):
 
       [Function Description]
-  
+
       [Formula]
-    
+
       [Symbols' Descriptions if necessary]
-    
+
       [References if necessary]
- 
+
   - 示例
 
       [Function Description]
@@ -119,18 +119,18 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
       [References if necessary]
 
       因fc没有必要列出的参考文献,故该内容省略。其他情况下需明确给出对应的参考文献和对应连接,以 layer_norm 为例:
-      
+
       ```
       Refer to `Layer Normalization `_ for more details.
       ```
-  
+
 
 - Args Description
-  
+
   - 格式
-  
+
       \[Arg's Name\][(Data Type, Default Value)][Description]
-  
+
   - 示例
 
       fc的部分参数注释如下:
@@ -145,35 +145,35 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
       ```
 
 - Returns
-  
+
   - 格式
-  
+
       [Name][Shape]
-  
+
   - 示例
-  
+
       ```
       Returns:
           A tensor variable storing the transformation result.
       ```
-  
+
       当返回值为包含多个参数的tuple时,应按顺序逐个介绍各参数,以dynamic_lstm为例:
-  
+
       ```
       Returns:
           A tuple containing:
             The hidden state of LSTM whose shape is (T X D).
             The cell state of LSTM whose shape is (T X D).
       ```
-  
+
 - Raises
 
   - 格式
-  
+
       [Exception Type][Condition]
 
   - 示例
-  
+
       ```
       Raises:
           ValueError: If the rank of the input is less than 2.
@@ -182,7 +182,7 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
 - Note
 
   - 格式
-  
+
      [Note]
 
   - 示例
@@ -198,15 +198,15 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
           2. When num_heads == 1, scaled_dot_product_attention has no learnable
              parameters.
       ```
-  
+
 - Examples
 
   - 格式
 
       \[Python Code Snipper]
-  
+
   - 示例
-  
+
       ```
       Examples:
           .. code-block:: python
diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..e57072d52fd162e92a3482aef33f99ab9394c532
--- /dev/null
+++ b/doc/fluid/dev/api_doc_std_en.md
@@ -0,0 +1,226 @@
+# API Doc Standard
+
+- [API Doc Structure](#api-doc-structure)
+- [Format and Examples](#format-and-examples)
+- [Complete Example](#complete-example)
+
+
+## API Doc Structure
+
+API Doc should contain the following parts(please write them in order):
+
+- Python API Definition
+
+  The definition of API
+
+- Function Description
+
+  Description of API's function. 
+  The description includes: meaning, purpose and operation on input of API, reference and corresponding link(if any), formula(if necessary) and explanations of key variables in the formula.
+
+- Args Description
+
+  Description of API parameters.
+  Introduce parameters one by one according to the order in API definition.
+  The introduction includes: data type, default value(if any), meaning, etc.
+
+- Returns
+
+  Introduction of API returned value.
+  Introduce the meaning of the returned value, and provide the corresponding format if necessary.
+  If returned value is a tuple containing multiple parameters, then introduce parameters one by one in order.
+
+- Raises(if any)
+
+   Exceptions or errors that may occur, and their possible causes. If more than one exception or error is possible, they should be listed in order.
+
+- Note(if any)
+
+  Matters needing attention. If there is more than one, they should be listed in order.
+
+- Examples
+
+  Examples of how to use API.
+
+
+## Format and Examples
+
+API documentation must obey reStructuredText format, please refer to [here](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html).
+The format and examples of each part of the API documentation are as follows (taking fc as an example):
+
+- Python API Definition
+
+  - Format
+
+      [Python API Definition]
+
+  - Example
+
+      ```
+      fc(input,
+         size,
+         num_flatten_dims=1,
+         param_attr=None,
+         bias_attr=None,
+         act=None,
+         name=None,
+         main_program=None,
+         startup_program=None)
+      ```
+
+- Function Description
+
+  - Format
+
+      This part contains (please write them in order):
+
+      [Function Description]
+
+      [Formula]
+
+      [Symbols' Descriptions if necessary]
+
+      [References if necessary]
+
+  - Example
+
+      [Function Description]
+
+       ```
+       **Fully Connected Layer**
+
+       The fully connected layer can take multiple tensors as its inputs. It
+       creates a variable called weights for each input tensor, which represents
+       a fully connected weight matrix from each input unit to each output unit.
+       The fully connected layer multiplies each input tensor with its corresponding
+       weight to produce an output Tensor. If multiple input tensors are given,
+       the results of multiple multiplications will be summed up. If bias_attr is
+       not None, a bias variable will be created and added to the output. Finally,
+       if activation is not None, it will be applied to the output as well.
+       ```
+
+      [Formula]
+
+      ```
+      This process can be formulated as follows:
+
+      .. math::
+
+           Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+      ```
+
+      [Symbols' Descriptions if necessary]
+
+      ```
+      In the above equation:
+
+      * :math:`N`: Number of the input.
+      * :math:`X_i`: The input tensor.
+      * :math:`W`: The weights created by this layer.
+      * :math:`b`: The bias parameter created by this layer (if needed).
+      * :math:`Act`: The activation function.
+      * :math:`Out`: The output tensor.
+      ```
+
+      [References if necessary]
+
+      Since fc has no references that need to be listed, this part is omitted here. In other cases, please provide the explicit reference and its link. Take layer_norm for example:
+
+      ```
+      Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_ for more details.
+      ```
+
+
+- Args Description
+
+  - Format
+
+      \[Arg's Name\][(Data Type, Default Value)][Description]
+
+  - Example
+
+      part of fc parameters are as follows:
+
+      ```
+      Args:
+          input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+              the input tensor(s) is at least 2.
+          param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+              parameters/weights of this layer.
+          name (str, default None): The name of this layer.
+      ```
+
+- Returns
+
+  - Format
+
+      [Name][Shape]
+
+  - Example
+
+      ```
+      Returns:
+          A tensor variable storing the transformation result.
+      ```
+
+      When the returned value is a tuple containing multiple parameters, please introduce each parameter in order. Take dynamic_lstm for example:
+
+      ```
+      Returns:
+          A tuple containing:
+            The hidden state of LSTM whose shape is (T X D).
+            The cell state of LSTM whose shape is (T X D).
+      ```
+
+- Raises
+
+  - Format
+
+      [Exception Type][Condition]
+
+  - Example
+
+      ```
+      Raises:
+          ValueError: If the rank of the input is less than 2.
+      ```
+
+- Note
+
+  - Format
+
+     [Note]
+
+  - Example
+
+      There is no Note in fc, so this part is omitted. If there is any note, please state it clearly. If there is more than one note, please list them in order. Take scaled\_dot\_product\_attention for example:
+
+      ```
+      Note:
+          1. When num_heads > 1, three linear projections are learned respectively
+             to map input queries, keys and values into queries', keys' and values'.
+             queries', keys' and values' have the same shapes with queries, keys
+             and values.
+          2. When num_heads == 1, scaled_dot_product_attention has no learnable
+             parameters.
+      ```
+
+- Examples
+
+  - Format
+
+      \[Python Code Snippet]
+
+  - Example
+
+      ```
+      Examples:
+          .. code-block:: python
+
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+            fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+      ```
+
+## Complete Example
+
+For a complete example of fc, please see [here](src/fc.py).
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b123b756e2251c38f319e1aefa2cb04fd7a36b03
--- /dev/null
+++ b/doc/fluid/dev/index_cn.rst
@@ -0,0 +1,13 @@
+开发标准
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  new_op_cn.md
+  new_op_kernel.md
+  use_eigen_cn.md
+  name_convention.md
+  support_new_device.md
+  releasing_process_cn.md
+  op_markdown_format.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..98988fc22dcedecdbcd67fb3bf761377bf046337
--- /dev/null
+++ b/doc/fluid/dev/index_en.rst
@@ -0,0 +1,13 @@
+Development
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  new_op_en.md
+  new_op_kernel.md
+  use_eigen_en.md
+  name_convention.md
+  support_new_device.md
+  releasing_process_en.md
+  op_markdown_format.md
diff --git a/doc/fluid/dev/name_convention.md b/doc/fluid/dev/name_convention.md
index a02b356f058da68442516c2705d0bac140f8ef18..75830ef28c67dc4694d899efe503084b7b5852e1 100644
--- a/doc/fluid/dev/name_convention.md
+++ b/doc/fluid/dev/name_convention.md
@@ -1,8 +1,8 @@
-## Operator's Parameter Name Convention
+# Operator's Parameter Name Convention
 
 To make the operator document itself more clear, we recommend operator names obey the listing conventions.
 
-### OpProtoMaker names
+## OpProtoMaker names
 
 When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator.
 
@@ -20,7 +20,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith
 - Order.
   - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice.
 
-### Best Practice
+## Best Practice
 
 Here we give some examples to show how these rules will be used.
 
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
index 92996585674b46f45549b972b9f295503b1c7f8c..0c3f88d9c31e05bec399c64bf6ade56e62e01f68 100644
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -26,13 +26,32 @@
 
 依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下:
 
-
- 内容            | 定义位置
---------------  | :----------------------
-OpProtoMake定义  | `.cc`文件,Backward Op不需要定义OpProtoMake
-Op定义           | `.cc`文件
-Kernel实现       | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。
-注册Op           | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中
+
+
+
+| 内容 | 定义位置 |
+| --- | --- |
+| OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake |
+| Op定义 | `.cc`文件 |
+| Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 |
+| 注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 |
+
+
 
 
 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md
index da8b1bdd1082e439456daf25e9b3a1e8eb534375..a566a09131f86251b70d5435d0a483aa2a705b35 100644
--- a/doc/fluid/dev/new_op_en.md
+++ b/doc/fluid/dev/new_op_en.md
@@ -33,6 +33,33 @@ Op definition           | `.cc` files
 Kernel implementation       | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
 Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
 
+
+
+
+| Information | Where is it defined |
+| --- | --- |
+| OpProtoMake definition | `.cc` files, Backward Op does not need an OpProtoMake interface. |
+| Op definition | `.cc` files |
+| Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu` files. |
+| Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation. |
+
+
+
 
 New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
 
@@ -279,7 +306,7 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass
 
       def test_check_output(self):
           self.check_output()
-          
+
       def test_check_grad_normal(self):
           self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
 
diff --git a/doc/fluid/dev/new_op_kernel_en.md b/doc/fluid/dev/new_op_kernel.md
similarity index 88%
rename from doc/fluid/dev/new_op_kernel_en.md
rename to doc/fluid/dev/new_op_kernel.md
index 123df0a7ee4943c0b789ef9cfa6e0804d0fdd564..55dea8d0a39232ede59d4663d6e1a47fbfc60853 100644
--- a/doc/fluid/dev/new_op_kernel_en.md
+++ b/doc/fluid/dev/new_op_kernel.md
@@ -1,14 +1,14 @@
-## Add Kernels for a New Device
+# Add Kernels for a New Device
 
-### Background
+## Background
 
 PaddlePaddle Fluid have hundreds of operators.  Each operator could have one or more kernels.  A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
 
 [This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
 
-### Write Kernels for A New Device 
+## Write Kernels for A New Device
 
-#### Add A New Device
+### Add A New Device
 
   For some historical reaons, we misuse the word *library* for *device*.  For example, we call the deivce type by *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24).  We will correct this ASAP.
 
@@ -23,7 +23,7 @@ enum class LibraryType {
 ```
 
 
-#### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
+### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
 
 If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`:
 
@@ -45,7 +45,7 @@ struct CUDAPlace {
 typedef boost::variant Place;
 ```
 
-#### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37))
+### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37))
 After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
 
 ```cpp
@@ -58,7 +58,7 @@ class DeviceContext {
 };
 ```
 
-#### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
+### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
 
 A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md)
 
@@ -85,7 +85,7 @@ class OpKernel : public OpKernelBase {
 ```
 
 
-#### Register the OpKernel to framework
+### Register the OpKernel to framework
 
 After writing the components described above, we should register the kernel to the framework.
 
@@ -107,7 +107,7 @@ take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/oper
 	REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
     		paddle::operators::GemmConvKernel,
     		paddle::operators::GemmConvKernel);
-    
+
 	REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
 	       paddle::operators::CUDNNConvOpKernel,
 	       paddle::operators::CUDNNConvOpKernel);
diff --git a/doc/fluid/dev/op_markdown_format.md b/doc/fluid/dev/op_markdown_format.md
index 0ee804d592252c727622cbe59b0644813db3c4fd..4e539d7992e5f67ee7b07193b59b6b425b73c9e5 100644
--- a/doc/fluid/dev/op_markdown_format.md
+++ b/doc/fluid/dev/op_markdown_format.md
@@ -15,26 +15,26 @@ The signature of the operator.
 
 Each section mentioned above has been covered in further detail in the rest of the document.
 
-# PaddlePaddle Operator Name
+## PaddlePaddle Operator Name
 This should be in all small letters, in case of multiple words, we separate them with an underscore. For example:
 `array to lod tensor` should be written as `array_to_lod_tensor`.
 
 This naming convention should be standard across all PaddlePaddle operators.
 
-# Standard Operator Name
+## Standard Operator Name
 This is the standard name of the operator as used in the community. The general standard is usually:
 - Standard abbreviations like `SGD` are written in all capital letters.
 - Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word).
 - Keep numbers inside a word as is, with no boundary delimiters.
 - Follow the name of the operator with the keyword: `Activation Operator.`
 
-# Operator description
+## Operator description
 This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
 
-# LaTeX equation
+## LaTeX equation
 This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`).
 
-# The signature
+## The signature
 This section describes the signature of the operator. A list of Inputs and Outputs, each of which have a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is:
 `Section :
 VariableName : (VariableType) VariableDescription
diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process_cn.md
similarity index 58%
rename from doc/fluid/dev/releasing_process.md
rename to doc/fluid/dev/releasing_process_cn.md
index b9787261092f1f27377886152cb1596d9ff54188..4c6728fba7150b0f1e180e57590f18a5b677c70d 100644
--- a/doc/fluid/dev/releasing_process.md
+++ b/doc/fluid/dev/releasing_process_cn.md
@@ -10,19 +10,10 @@ PaddlePaddle每次发新的版本,遵循以下流程:
   * 使用Regression Test List作为检查列表,测试本次release的正确性。
 	  * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步
 	* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
-	* 编译这个版本的python wheel包,并发布到pypi。
-		* 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。
-		* pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`。
-		* 上传方法:
-			```
-			cd build/python
-			pip install twine
-			twine upload dist/[package to upload]
-			```
-		* 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步
-1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
-1. 协同完成Release Note的书写
-
+	* 将这个版本的python wheel包发布到pypi。
+	* 更新Docker镜像(参考后面的操作细节)。
+1. 第三步完成后,将`release/版本号`分支合入master分支,将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。
+1. 协同完成Release Note的书写。
 
 需要注意的是:
 
@@ -31,13 +22,18 @@ PaddlePaddle每次发新的版本,遵循以下流程:
 
 ## 发布wheel包到pypi
 
-使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+1. 使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
 完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以
-弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。等待编译完成后
-可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。然后按照上述的方法
-使用`twine`工具上传即可。
-
- +弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。
+
+弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。
+1. 等待编译完成后可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。
+1. 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。
+1. 上传:
+```
+cd build/python
+pip install twine
+twine upload dist/[package to upload]
+```
 
 * 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
   发型版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
@@ -48,10 +44,20 @@ PaddlePaddle每次发新的版本,遵循以下流程:
 上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上
 版本号对应的tag即可:
 
-1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述编译wheel包完成后是否最新。
-1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`,latest tag可以是latest或latest-gpu等。
-1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
-1. 执行 `docker push paddlepaddle/paddle:[version]`
+```
+docker pull [镜像]:latest
+docker tag [镜像]:latest [镜像]:[version]
+docker push [镜像]:[version]
+```
+
+需要更新的镜像tag包括:
+
+* `[version]`: CPU版本
+* `[version]-openblas`: openblas版本
+* `[version]-gpu`: GPU版本(CUDA 8.0 cudnn 5)
+* `[version]-gpu-[cudaver]-[cudnnver]`: 不同cuda, cudnn版本的镜像
+
+之后可进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看是否发布成功。
 
 ## PaddlePaddle 分支规范
 
@@ -66,7 +72,7 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 	* 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
 	* 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。
 	* 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。
-		* 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。 
+		* 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。
 
 * BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。
 
@@ -76,15 +82,118 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 
 ### PaddlePaddle Book中所有章节
 
-PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
-
-| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| API.V2 + Docker + GPU  |  |  |  |  |  |  |  |  |
-| API.V2 + Docker + CPU  |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Docker + GPU |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Docker + CPU |  |  |  |  |  |  |  |  |
-| API.V2 + Ubuntu + GPU |  |  |  |  |  |  |  |  |
-| API.V2 + Ubuntu + CPU |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Ubuntu + GPU |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练(V2和Fluid)模型正确性。
+
+
+1. 等待编译完成后可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。
+1. 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。
+1. 上传:
+```
+cd build/python
+pip install twine
+twine upload dist/[package to upload]
+```
 
 * 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
   发型版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
@@ -48,10 +44,20 @@ PaddlePaddle每次发新的版本,遵循以下流程:
 上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上
 版本号对应的tag即可:
 
-1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述编译wheel包完成后是否最新。
-1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`,latest tag可以是latest或latest-gpu等。
-1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
-1. 执行 `docker push paddlepaddle/paddle:[version]`
+```
+docker pull [镜像]:latest
+docker tag [镜像]:latest [镜像]:[version]
+docker push [镜像]:[version]
+```
+
+需要更新的镜像tag包括:
+
+* `[version]`: CPU版本
+* `[version]-openblas`: openblas版本
+* `[version]-gpu`: GPU版本(CUDA 8.0 cudnn 5)
+* `[version]-gpu-[cudaver]-[cudnnver]`: 不同cuda, cudnn版本的镜像
+
+之后可进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看是否发布成功。
 
 ## PaddlePaddle 分支规范
 
@@ -66,7 +72,7 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 	* 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
 	* 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。
 	* 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。
-		* 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。 
+		* 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。
 
 * BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。
 
@@ -76,15 +82,118 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 
 ### PaddlePaddle Book中所有章节
 
-PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
-
-| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| API.V2 + Docker + GPU  |  |  |  |  |  |  |  |  |
-| API.V2 + Docker + CPU  |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Docker + GPU |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Docker + CPU |  |  |  |  |  |  |  |  |
-| API.V2 + Ubuntu + GPU |  |  |  |  |  |  |  |  |
-| API.V2 + Ubuntu + CPU |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Ubuntu + GPU |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练(V2和Fluid)模型正确性。
+
+
+
+
+| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| API.V2 + Docker + GPU |  |  |  |  |  |  |  |  |
+| API.V2 + Docker + CPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Docker + GPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Docker + CPU |  |  |  |  |  |  |  |  |
+| API.V2 + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+| API.V2 + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+
+
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..f989b964d6d1a329bbe31adc7ec10db017acaefa
--- /dev/null
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -0,0 +1,210 @@
+# PaddlePaddle Releasing Process
+
+PaddlePaddle manages its branches using the "git-flow branching model", and uses [Semantic Versioning](http://semver.org/) as its version number semantics.
+
+Each time we release a new PaddlePaddle version, we should follow the below steps:
+
+1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
+1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The
+   first tag should be `0.10.0rc1`, and the second should be `0.10.0rc2` and so on.
+1. After that, we should do:
+    * Run all regression tests on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm
+      that this release has no major bugs.
+        * If a regression test fails, we must fix those bugs and create a new `release/[version]`
+          branch from the previous release branch.
+    * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
+    * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
+    * Update the Docker images (see below instructions for detail).
+1. After above step, merge `release/[version]` branch to master and push a tag on the master commit,
+   then merge `master` to `develop`.
+1. Update the Release Note.          
+
+***NOTE:***
+
+* Do ***NOT*** merge commits from the develop branch into release branches, so that each release branch
+  contains only the features of the current release and we can test on that version.
+* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch.
+
+## Publish Wheel Packages to pypi
+
+1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+   to build all wheel packages needed to publish. As shown in the following picture, choose a build
+   version, click the "..." button on the right side of the "Run" button, switch to the second tab in the
+   pop-up box, choose the current release branch and click the "Run Build" button. You may repeat this
+   step to start different versions of builds.
+1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`.
+1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we
+     upload the package using `twine`, we need to rename the package from `linux_x86_64` to
+     `manylinux1_x86_64`.
+1. Start the upload:
+     ```
+     cd build/python
+     pip install twine
+     twine upload dist/[package to upload]
+     ```
+
+* NOTE: We use a special Docker image to build our releases to support more Linux distributions, you can
+  download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using
+    scripts under `tools/manylinux1`.
+* pypi does not allow overwriting an already uploaded version of a wheel package, even if you delete the
+  old version. You must change the version number before uploading a new one.
+
+## Publish Docker Images
+
+Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:
+
+```
+docker pull [image]:latest
+docker tag [image]:latest [image]:[version]
+docker push [image]:[version]
+```
+
+Tags that need to be updated are:
+* `[version]`: CPU only version image
+* `[version]-openblas`: openblas version image
+* `[version]-gpu`: GPU version(using CUDA 8.0 cudnn 5)
+* `[version]-gpu-[cudaver]-[cudnnver]`: tag for different cuda, cudnn versions
+
+You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
+
+## Branching Model
+
+We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
+with some modifications:
+
+* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
+* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no
+  regression tests are run.
+* `release/[version]` branch is used to publish each release. Latest release version branches have
+  bugfix only for that version, but no feature updates.
+* Developer forks are not required to follow the
+  [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
+  branching model; every fork is treated like a feature branch.
+    * Advice: use the fork's develop branch to sync up with the main repo's develop branch.
+    * Advice: create a new branch from the fork's develop branch to start developing.
+    * Use that branch on the developer's fork to create pull requests and start reviews.
+        * The developer can push new commits to that branch while the pull request is open.
+* Bug fixes also start from developers' forked repos. A bug-fix branch can be merged into
+  `master`, `develop` and `releases`.
+
+## PaddlePaddle Regression Test List
+
+### All Chapters of PaddlePaddle Book
+
+We need to guarantee that all the chapters of PaddlePaddle Book can run correctly, including
+V1 (`paddle_trainer` training), V2 training and Fluid training.
+
+
+1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`.
+1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we
+     upload the package using `twine`, we need to rename the package from `linux_x86_64` to
+     `manylinux1_x86_64`.
+1. Start the upload:
+     ```
+     cd build/python
+     pip install twine
+     twine upload dist/[package to upload]
+     ```
+
+* NOTE: We use a special Docker image to build our releases to support more Linux distributions, you can
+  download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using
+    scripts under `tools/manylinux1`.
+* pypi does not allow overwriting an already uploaded version of a wheel package, even if you delete the
+  old version. You must change the version number before uploading a new one.
+
+## Publish Docker Images
+
+Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:
+
+```
+docker pull [image]:latest
+docker tag [image]:latest [image]:[version]
+docker push [image]:[version]
+```
+
+Tags that need to be updated are:
+* `[version]`: CPU only version image
+* `[version]-openblas`: openblas version image
+* `[version]-gpu`: GPU version(using CUDA 8.0 cudnn 5)
+* `[version]-gpu-[cudaver]-[cudnnver]`: tag for different cuda, cudnn versions
+
+You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
+
+## Branching Model
+
+We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
+with some modifications:
+
+* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
+* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no
+  regression tests are run.
+* `release/[version]` branch is used to publish each release. Latest release version branches have
+  bugfix only for that version, but no feature updates.
+* Developer forks are not required to follow the
+  [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
+  branching model; every fork is treated like a feature branch.
+    * Advice: use the fork's develop branch to sync up with the main repo's develop branch.
+    * Advice: create a new branch from the fork's develop branch to start developing.
+    * Use that branch on the developer's fork to create pull requests and start reviews.
+        * The developer can push new commits to that branch while the pull request is open.
+* Bug fixes also start from developers' forked repos. A bug-fix branch can be merged into
+  `master`, `develop` and `releases`.
+
+## PaddlePaddle Regression Test List
+
+### All Chapters of PaddlePaddle Book
+
+We need to guarantee that all the chapters of PaddlePaddle Book can run correctly, including
+V1 (`paddle_trainer` training), V2 training and Fluid training.
+
+
+
+
+| | Linear Regression | Recognize Digits | Image Classification | Word2Vec | Personalized Recommendation | Sentiment Analysis | Semantic Role Labeling | Machine Translation |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| API.V2 + Docker + GPU |  |  |  |  |  |  |  |  |
+| API.V2 + Docker + CPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Docker + GPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Docker + CPU |  |  |  |  |  |  |  |  |
+| API.V2 + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+| API.V2 + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+
+
diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md
index f36843b4408c21bdca1fa83853e5b0a40116791c..75922e7d85a13e53ce94619a48d8da8b960e6c9a 100644
--- a/doc/fluid/dev/use_eigen_cn.md
+++ b/doc/fluid/dev/use_eigen_cn.md
@@ -1,16 +1,16 @@
-## 在Paddle中如何使用Eigen
+# 在Paddle中如何使用Eigen
 
 神经网络本质上是一个计算图,计算需要的数据存放在`Tensor`中,而计算过程是由`Operartor`来描述的。在执行时,`Operator`调用对应`OpKernel`中的`Compute`接口,实现对`Tensor`的操作。
 
 
-### Eigen Tensor模块
+## Eigen Tensor模块
 
 Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。
 
 关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) 和[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)
 
 
-### paddle::framework::Tensor
+## paddle::framework::Tensor
 
 Paddle Tensor定义在framework目录下,其主要接口如下:
 
@@ -20,14 +20,14 @@ class Tensor {
   /*! Return a pointer to mutable memory block. */
   template 
   inline T* data();
-  
+
   /**
    * @brief   Return a pointer to mutable memory block.
    * @note    If not exist, then allocation.
    */
   template 
   inline T* mutable_data(platform::Place place);
-  
+
   /**
    * @brief     Return a pointer to mutable memory block.
    *
@@ -38,17 +38,17 @@ class Tensor {
    */
   template 
   inline T* mutable_data(DDim dims, platform::Place place);
-  
+
   /*! Resize the dimensions of the memory block. */
   inline Tensor& Resize(const DDim& dims);
-  
+
   /*! Return the dimensions of the memory block. */
   inline const DDim& dims() const;
 
  private:  
   /*! holds the memory block if allocated. */
   std::shared_ptr holder_;
-  
+
   /*! points to dimensions of memory block. */
   DDim dim_;
 };
@@ -129,7 +129,7 @@ From是EigenTensor模板提供的一个接口,可以实现从paddle::framework
 
 
 
-### 实现计算
+## 实现计算
 
 当需要完成计算时,我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是,这里的EigenTensor之间的运算只是改变了原有Tensor中的数据,而不会改变原有Tensor的shape信息。
 
diff --git a/doc/fluid/dev/use_eigen_en.md b/doc/fluid/dev/use_eigen_en.md
index 3a466f73d1f9b94a29b171015279c782ca50bd02..3313d097cb21e40c23aa13187b6a50562f12403a 100644
--- a/doc/fluid/dev/use_eigen_en.md
+++ b/doc/fluid/dev/use_eigen_en.md
@@ -1,9 +1,9 @@
-## How to use Eigen in Paddle
+# How to use Eigen in Paddle
 
 Essentially, a neural network is a compute graph. T data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`.
 
 
-### Eigen Tensor Module
+## Eigen Tensor Module
 
 The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU.
 
@@ -12,7 +12,7 @@ Note that Eigen Tensor is still being actively developed, so its tests are not c
 For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
 
 
-### paddle::framework::Tensor
+## paddle::framework::Tensor
 
 Paddle Tensor's is defined in the framework directory with the following interface:
 
@@ -105,7 +105,7 @@ void Compute(const framework::ExecutionContext& context) const override {
 ```
 
 
-### paddle::framework::Tensor到EigenTensor的转换
+## paddle::framework::Tensor到EigenTensor的转换
 
 As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
 
@@ -129,7 +129,7 @@ For more transformations, see the [unit tests](https://github.com/PaddlePaddle/P
 
 
 
-### Implementing Computation
+## Implementing Computation
 
 While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally inthe Tensor and does not change all the shape information associated with the Tensor.
 
diff --git a/doc/fluid/faq/index_cn.rst b/doc/fluid/faq/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..395c1109891b5a00eab6f0b44d855658def7fdd6
--- /dev/null
+++ b/doc/fluid/faq/index_cn.rst
@@ -0,0 +1,2 @@
+FAQ
+------------
diff --git a/doc/fluid/faq/index_en.rst b/doc/fluid/faq/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..395c1109891b5a00eab6f0b44d855658def7fdd6
--- /dev/null
+++ b/doc/fluid/faq/index_en.rst
@@ -0,0 +1,2 @@
+FAQ
+------------
diff --git a/doc/fluid/getstarted/concepts/index_cn.rst b/doc/fluid/getstarted/concepts/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2e7f70fc4cb871a80ffaffec6c06797973cd2f85
--- /dev/null
+++ b/doc/fluid/getstarted/concepts/index_cn.rst
@@ -0,0 +1,4 @@
+基本使用概念
+============
+
+TBD
diff --git a/doc/fluid/getstarted/concepts/index_en.rst b/doc/fluid/getstarted/concepts/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..78cca1e2a3443c2949ca0655190b0f05502f519a
--- /dev/null
+++ b/doc/fluid/getstarted/concepts/index_en.rst
@@ -0,0 +1,4 @@
+Concepts
+============
+
+TBD
diff --git a/doc/fluid/getstarted/concepts/save_model/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md
index e29129fddf775939c9f7a8b49d850d523e6e5a45..1f12ba0497369eacc6a2db7984781b5672f45ea1 100644
--- a/doc/fluid/getstarted/concepts/save_model/model_format.md
+++ b/doc/fluid/getstarted/concepts/save_model/model_format.md
@@ -4,30 +4,70 @@
 
 A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code.
 
-As a result, In PaddlePaddle, the **topology** is represented as a  [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters. 
+As a result, in PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters.
 
 ## Implementation
 
-The topology is saved as a plain text in a detailed self-contain protobuf file. 
+The topology is saved as plain text in a detailed, self-contained protobuf file.
 
 The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task.
 
-As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, 
+As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is,
 
 The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
 
-|field name  | type | description |
-| --- | --- | --- |
-| version | uint32_t | Version of saved file. Always 0 now. |
-| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. |
-| tensor desc | void* | TensorDesc protobuf binary message |
-| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
-| lod_level | uint64_t | Level of LoD |
-| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
-| data of lod[0] | uint64_t*  | [Optional] lod[0].data() |
-| ... | ... | ... |
-
+<table>
+<thead>
+<tr>
+<th>field name</th>
+<th>type</th>
+<th>description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>version</td>
+<td>uint32_t</td>
+<td>Version of saved file. Always 0 now.</td>
+</tr>
 
+<tr>
+<td>tensor desc length</td>
+<td>uint32_t</td>
+<td>TensorDesc(Protobuf message) length in bytes.</td>
+</tr>
+<tr>
+<td>tensor desc</td>
+<td>void*</td>
+<td>TensorDesc protobuf binary message</td>
+</tr>
+<tr>
+<td>tensor data</td>
+<td>void*</td>
+<td>Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()`</td>
+</tr>
+<tr>
+<td>lod_level</td>
+<td>uint64_t</td>
+<td>Level of LoD</td>
+</tr>
+<tr>
+<td>length of lod[0]</td>
+<td>uint64_t</td>
+<td>[Optional] length of lod[0] in bytes.</td>
+</tr>
+<tr>
+<td>data of lod[0]</td>
+<td>uint64_t*</td>
+<td>[Optional] lod[0].data()</td>
+</tr>
+<tr>
+<td>...</td>
+<td>...</td>
+<td>...</td>
+</tr>
+</tbody>
+</table>
 
 ## Summary
 
diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..75af7354be93a6eeabfa9ccf86903505402a7ca6
--- /dev/null
+++ b/doc/fluid/getstarted/index_cn.rst
@@ -0,0 +1,19 @@
+新手入门
+============
+
+
+如果需要快速了解PaddlePaddle的使用,可以参考以下指南。
+
+..  toctree::
+  :maxdepth: 1
+
+  quickstart_cn.rst
+
+
+在使用PaddlePaddle构建应用时,需要了解一些基本概念。
+这里以一个线性回归为例子,详细介绍了PaddlePaddle的使用流程,包括数据格式,模型配置与训练等。
+
+..  toctree::
+  :maxdepth: 1
+
+  concepts/use_concepts_cn.rst
diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..75a43f4af87c34830ec940068196e6ca72640501
--- /dev/null
+++ b/doc/fluid/getstarted/index_en.rst
@@ -0,0 +1,18 @@
+GET STARTED
+============
+
+If you want to quickly know how to use PaddlePaddle, please refer to the following guide:
+
+..  toctree::
+  :maxdepth: 1
+
+  quickstart_en.rst
+
+While using PaddlePaddle to build applications, please understand some basic concepts.
+
+Here is an example of linear regression. It introduces the workflow of PaddlePaddle, including data format, model configuration and training, etc.
+
+..  toctree::
+  :maxdepth: 1
+
+  concepts/index_en.rst
diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..93a9e4e37a8495c553cec257c27363ca8d062d39
--- /dev/null
+++ b/doc/fluid/getstarted/quickstart_cn.rst
@@ -0,0 +1 @@
+../../v2/getstarted/quickstart_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..6e1894faa1176bb9e77f616e07df36191e54b782
--- /dev/null
+++ b/doc/fluid/getstarted/quickstart_en.rst
@@ -0,0 +1 @@
+../../v2/getstarted/quickstart_en.rst
\ No newline at end of file
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..b99b90056b0a2e51f2668a6d27d94857bdc09c37
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -0,0 +1,181 @@
+# Fluid 分布式版本使用指南
+本篇文章将说明如何在PaddlePaddle Fluid版本下进行分布式训练的配置和执行,以及将单机训练脚本改造成支持集群训练的版本
+
+## 准备工作
+* 可用的集群
+
+    包含一个或多个计算节点的集群,每一个节点都能够执行PaddlePaddle的训练任务且拥有唯一的IP地址,集群内的所有计算节点可以通过网络相互通信。
+* 安装PaddlePaddle Fluid with Distribution版本
+
+    所有的计算节点上均需要安装分布式版本的PaddlePaddle, 在用于GPU等设备的机器上还需要额外安装好相应的驱动程序和CUDA的库。
+
+    **注意:**当前对外提供的PaddlePaddle版本并不支持分布式,需要通过源码重新编译。编译和安装方法参见[编译和安装指南](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html)。
+    cmake编译命令中需要将WITH_DISTRIBUTE设置为ON,下面是一个cmake编译指令示例:
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+
+## 更新训练脚本
+这里,我们以[Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)课程中的第一章 fit a line 为例,描述如何将单机训练脚本改造成支持集群训练的版本。
+### 单机训练脚本示例
+```python
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
+```
+
+我们创建了一个简单的全连接神经网络程序,并且通过Fluid的Executor执行了100次迭代,现在我们需要将该单机版本的程序更新为分布式版本的程序。
+### 介绍Parameter Server
+在非分布式版本的训练脚本中,只存在Trainer一种角色,它不仅处理常规的计算任务,也处理参数相关的计算、保存和优化任务。在分布式版本的训练过程中,由于存在多个Trainer节点进行同样的数据计算任务,因此需要有一个中心化的节点来统一处理参数相关的保存和分配。在PaddlePaddle中,我们称这样的节点为[Parameter Server](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/dist_train/parameter_server.md)
+
+**因此,在分布式的Fluid环境中,我们有两个角色需要创建,分别是Parameter Server和Trainer。**
+
+### 分布式训练
+Fluid专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数,将他们分隔为两部分,通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
+```python
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+```
+将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下:
+```python
+... #define the program, cost, and create sgd optimizer
+
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
+
+t = fluid.DistributeTranspiler() # create the transpiler instance
+# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+... #create executor
+
+# in pserver, run this
+#current_endpoint here means current pserver IP:PORT you wish to run on
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
+
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+    for data in train_reader():
+        exe.run(t.get_trainer_program())
+```
+### 分布式训练脚本运行说明
+分布式任务的运行需要将表格中说明的多个参数进行赋值:
+
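+<table>
+<thead>
+<tr>
+<th>参数名</th>
+<th>值类型</th>
+<th>说明</th>
+<th>示例</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>trainer_id</td>
+<td>int</td>
+<td>当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值</td>
+<td>0/1/2/3</td>
+</tr>
+<tr>
+<td>pservers</td>
+<td>str</td>
+<td>parameter server 列表</td>
+<td>127.0.0.1:6710,127.0.0.1:6711</td>
+</tr>
+<tr>
+<td>trainers</td>
+<td>int</td>
+<td>训练节点的总个数,>0的数字</td>
+<td>4</td>
+</tr>
+<tr>
+<td>server_endpoint</td>
+<td>str</td>
+<td>当前所起的服务节点的IP:PORT</td>
+<td>127.0.0.1:8789</td>
+</tr>
+<tr>
+<td>training_role</td>
+<td>str</td>
+<td>节点角色, TRAINER/PSERVER</td>
+<td>PSERVER</td>
+</tr>
+</tbody>
+</table>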
+
+
+**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下:
+
+```python
+t = fluid.DistributeTranspiler()
+t.transpile(
+    optimize_ops,
+    params_grads,
+    trainer_id,
+    pservers=pserver,
+    trainers=trainers)
+if training_role == "PSERVER":
+    pserver_prog = t.get_pserver_program(server_endpoint)
+    pserver_startup = t.get_startup_program(server_endpoint, pserver_prog)
+```
+
+### Demo
+完整的demo代码位于Fluid的test目录下的[book](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py)中。
+
+第一步,进入demo代码所在目录:
+```bash
+cd /paddle/python/paddle/fluid/tests/book
+```
+
+第二步,启动Parameter Server:
+```bash
+PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
+```
+执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Parameter Server已经正常启动。
+
+第三步,启动Trainer:
+```bash
+PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
+```
+由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。
+
+现在我们就启动了一个包含一个Parameter Server和两个Trainer的分布式训练任务。
diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..97aeaf167d329529f2b120b5a3d4085e0510fe16
--- /dev/null
+++ b/doc/fluid/howto/index_cn.rst
@@ -0,0 +1,7 @@
+进阶使用
+------------
+
+.. toctree::
+  :maxdepth: 1
+  
+  optimization/index_cn.rst
diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fd21e167ce3a46da167db1e9d7013804f730e047
--- /dev/null
+++ b/doc/fluid/howto/index_en.rst
@@ -0,0 +1,7 @@
+HOW TO
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  optimization/index_en.rst
diff --git a/doc/fluid/howto/optimization/benchmark/README.md b/doc/fluid/howto/optimization/benchmark/README.md
new file mode 120000
index 0000000000000000000000000000000000000000..db30af7f53231c687f9ad61ad961a685733cbad0
--- /dev/null
+++ b/doc/fluid/howto/optimization/benchmark/README.md
@@ -0,0 +1 @@
+../../../../../benchmark/cluster/README.md
\ No newline at end of file
diff --git a/doc/fluid/howto/optimization/benchmark/index_cn.rst b/doc/fluid/howto/optimization/benchmark/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9404800eb86ca6d27886258b67393028c76954dc
--- /dev/null
+++ b/doc/fluid/howto/optimization/benchmark/index_cn.rst
@@ -0,0 +1,8 @@
+基准
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  vgg16/README.md
+  README.md
diff --git a/doc/fluid/howto/optimization/benchmark/index_en.rst b/doc/fluid/howto/optimization/benchmark/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1e200b660cc7f6aeaf8b3d94fd7a14999a52bccd
--- /dev/null
+++ b/doc/fluid/howto/optimization/benchmark/index_en.rst
@@ -0,0 +1,8 @@
+Benchmark
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  vgg16/README.md
+  README.md
diff --git a/doc/fluid/howto/optimization/benchmark/vgg16/README.md b/doc/fluid/howto/optimization/benchmark/vgg16/README.md
new file mode 120000
index 0000000000000000000000000000000000000000..ca963ef5f06aa0c2fe507ba7548dca8017358120
--- /dev/null
+++ b/doc/fluid/howto/optimization/benchmark/vgg16/README.md
@@ -0,0 +1 @@
+../../../../../../benchmark/cluster/vgg16/README.md
\ No newline at end of file
diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md
index d59be670c2b33b64d9b6f96b53f50e5bf9f0613b..8266dec3c6125a09b90ac0ccd4aa5464f5c7db31 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
@@ -8,7 +8,7 @@ PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大
 * Python 与 C++ 混合代码的性能分析
 
 
-## Python代码的性能分析
+# Python代码的性能分析
 
 ### 生成性能分析文件
 
@@ -42,14 +42,40 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
 
 每一列的含义是:
 
-| 列名 | 含义 |
-| --- | --- |
-| ncalls | 函数的调用次数 |
-| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
-| percall | tottime的每次调用平均时间 |
-| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
-| percall | cumtime的每次调用平均时间 |
-| filename:lineno(function) | 文件名, 行号,函数名 |
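+<table>
+<thead>
+<tr>
+<th>列名</th>
+<th>含义</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>ncalls</td>
+<td>函数的调用次数</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td>函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td>percall</td>
+<td>tottime的每次调用平均时间</td>
+</tr>
+<tr>
+<td>cumtime</td>
+<td>函数总时间。包含这个函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td>percall</td>
+<td>cumtime的每次调用平均时间</td>
+</tr>
+<tr>
+<td>filename:lineno(function)</td>
+<td>文件名, 行号,函数名</td>
+</tr>
+</tbody>
+</table>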
 
 
 ### 寻找性能瓶颈
diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md
index 01e5fddf61547f9fc86ef18a6f2e2ac508d22dbb..e95556dd608b7ff0a3eb18873df0015a2da94e7c 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -14,7 +14,7 @@ the profiling and tuning of
 1. the Python code and
 1. the mixture of Python and C++ code.
 
-## Profiling the Python Code
+# Profiling the Python Code
 
 ### Generate the Performance Profiling File
 
@@ -57,14 +57,40 @@ port, we will see the output like the following:
 where each line corresponds to Python function, and the meaning of
 each column is as follows:
 
-| column | meaning |
-| --- | --- |
-| ncalls | the number of calls into a function |
-| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
-| percall | tottime divided by ncalls |
-| cumtime | the total execution time of the function, including the execution time of other functions being called |
-| percall | cumtime divided by ncalls |
-| filename:lineno(function) | where the function is defined |
+<table>
+<thead>
+<tr>
+<th>column</th>
+<th>meaning</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>ncalls</td>
+<td>the number of calls into a function</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td>the total execution time of the function, not including the execution time of other functions called by the function</td>
+</tr>
+<tr>
+<td>percall</td>
+<td>tottime divided by ncalls</td>
+</tr>
+<tr>
+<td>cumtime</td>
+<td>the total execution time of the function, including the execution time of other functions being called</td>
+</tr>
+<tr>
+<td>percall</td>
+<td>cumtime divided by ncalls</td>
+</tr>
+<tr>
+<td>filename:lineno(function)</td>
+<td>where the function is defined</td>
+</tr>
+</tbody>
+</table>
 
 ### Identify Performance Bottlenecks
 
@@ -81,7 +107,7 @@ focus on. We can sort above profiling file by tottime:
 
 We can see that the most time-consuming function is the `built-in
 method run`, which is a C++ function in `libpaddle.so`.  We will
-explain how to profile C++ code in the next section.  At this 
+explain how to profile C++ code in the next section.  At this
 moment, let's look into the third function `sync_with_cpp`, which is a
 Python function.  We can click it to understand more about it:
 
diff --git a/doc/fluid/howto/optimization/index_cn.rst b/doc/fluid/howto/optimization/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..27cc96702356703b339db845dc81913bdcc9f23b
--- /dev/null
+++ b/doc/fluid/howto/optimization/index_cn.rst
@@ -0,0 +1,9 @@
+性能优化
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  timeline.md
+  cpu_profiling_cn.md
+  benchmark/index_cn.rst
diff --git a/doc/fluid/howto/optimization/index_en.rst b/doc/fluid/howto/optimization/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4ce624fe8f108a6afc7cd08a1542332755d22e04
--- /dev/null
+++ b/doc/fluid/howto/optimization/index_en.rst
@@ -0,0 +1,9 @@
+Performance Optimization
+---------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  timeline.md
+  cpu_profiling_en.md
+  benchmark/index_en.rst
diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline.md
index 9d9565a3e698a83ca465c5da83ff892360c33b8f..96481ae2a6e4442d40803f8d5361e5f942502df3 100644
--- a/doc/fluid/howto/optimization/timeline.md
+++ b/doc/fluid/howto/optimization/timeline.md
@@ -1,4 +1,4 @@
-## how to use timeline tool to do profile
+# How to use the timeline tool for profiling
 
 1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
 
diff --git a/doc/fluid/howto/performance/profiler.md b/doc/fluid/howto/performance/profiler.md
index b20b5efdc1f1f10ce7cec835adcc6fb374ed4e20..ee96e7c74ce317caddb387cbb1d4998937bd5c81 100644
--- a/doc/fluid/howto/performance/profiler.md
+++ b/doc/fluid/howto/performance/profiler.md
@@ -23,7 +23,7 @@ But how to record the time for the mixed C++ and CUDA program?  There many C++ A
 
 The overall flow is shown as the following figure.
 
-
+
 
 ### Event
 
@@ -36,10 +36,10 @@ enum EventKind {
   kPopRange};
 ```
 - kMark: only a marker without time range.
-- kPushRange: mark the starting event for time range. 
+- kPushRange: mark the starting event for time range.
 - kPopRange: mark the ending event for time range.
 
-For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used.  For many pieces of code, an event lists are used to record each piece. 
+For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used.  For many pieces of code, an event list is used to record each piece.
 
 ```c++
 class Event {
@@ -66,11 +66,11 @@ struct EventList {
 };
 ```
 
-As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler. 
+As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler.
 
 ```c++
 enum ProfilerState {
-  kDisabled, 
+  kDisabled,
   kCPU,
   kCUDA
 };
diff --git a/doc/fluid/images/2_level_rnn.dot b/doc/fluid/images/2_level_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..5d77865061ca7bbbfcf254dd938f09aef5553505
--- /dev/null
+++ b/doc/fluid/images/2_level_rnn.dot
@@ -0,0 +1,56 @@
+digraph G {
+
+  rnn [label="1st level RNN" shape=box]
+
+  subgraph cluster0 {
+    label = "time step 0"
+
+    sent0 [label="sentence"]
+    sent1 [label="sentence"]
+
+    rnn1 [label="2nd level RNN" shape=box]
+
+    sent0 -> rnn1
+    sent1 -> rnn1
+  }
+
+  subgraph cluster1 {
+    label = "time step 1"
+
+    sent2 [label="sentence"]
+    sent3 [label="sentence"]
+
+    rnn2 [label="2nd level RNN" shape=box]
+
+    sent2 -> rnn2
+    sent3 -> rnn2
+  }
+
+  subgraph cluster2 {
+    label = "time step 2"
+
+    sent4 [label="sentence"]
+    sent5 [label="sentence"]
+
+    rnn3 [label="2nd level RNN" shape=box]
+
+    sent4 -> rnn3
+    sent5 -> rnn3
+  }
+
+
+  para0 [label="paragraph info 0"]
+  para1 [label="paragraph info 1"]
+  para2 [label="paragraph info 2"]
+
+  rnn1 -> para0
+  rnn2 -> para1
+  rnn3 -> para2
+
+  para0 -> rnn
+  para1 -> rnn
+  para2 -> rnn
+
+  chapter [label="chapter info"]
+  rnn -> chapter
+}
diff --git a/doc/fluid/images/2_level_rnn.png b/doc/fluid/images/2_level_rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..0537a75beb175c0c284717421f7aa908da2a5038
Binary files /dev/null and b/doc/fluid/images/2_level_rnn.png differ
diff --git a/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455
Binary files /dev/null and b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/fluid/images/asgd.gif b/doc/fluid/images/asgd.gif
new file mode 100644
index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e
Binary files /dev/null and b/doc/fluid/images/asgd.gif differ
diff --git a/doc/fluid/images/batch_norm_fork.dot b/doc/fluid/images/batch_norm_fork.dot
new file mode 100644
index 0000000000000000000000000000000000000000..4bc47713cba2cb23f1b34fffe6426ef10ac3a9df
--- /dev/null
+++ b/doc/fluid/images/batch_norm_fork.dot
@@ -0,0 +1,25 @@
+digraph ImageBatchNormForkGragh {
+  subgraph cluster_before {
+    Prev [label="...", shape=plaintext];
+    Rnn [label="rnn_op", shape=box];
+    BatchNorm [label="batch_norm_op", shape=box];
+    Fc [label="fc_op", shape=box];
+    After [label="...", shape=plaintext];
+    Prev -> Rnn -> BatchNorm -> Fc -> After;
+    label="original";
+  }
+
+  subgraph cluster_after {
+    Prev2 [label="...", shape=plaintext];
+    Rnn2 [label="rnn_op", shape=box];
+    BatchNorm2_1 [label="train_batch_norm_op", shape=box];
+    BatchNorm2_2 [label="infer_batch_norm_op", shape=box];
+    Fc2_1 [label="fc_op", shape=box];
+    Fc2_2 [label="fc_op", shape=box];
+    After2_1 [label="...", shape=plaintext];
+    After2_2 [label="...", shape=plaintext];
+    Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1;
+    Rnn2 -> BatchNorm2_2 ->Fc2_2 ->After2_2
+    label="forked";
+  }
+}
diff --git a/doc/fluid/images/batch_norm_fork.png b/doc/fluid/images/batch_norm_fork.png
new file mode 100644
index 0000000000000000000000000000000000000000..aded62bce5bc268b7a3ef4dc96c89fe21d6ea955
Binary files /dev/null and b/doc/fluid/images/batch_norm_fork.png differ
diff --git a/doc/fluid/images/batch_norm_op_kernel.png b/doc/fluid/images/batch_norm_op_kernel.png
new file mode 100644
index 0000000000000000000000000000000000000000..a99ce81ff3bf42880ebbd6a1297de3bf038e09b2
Binary files /dev/null and b/doc/fluid/images/batch_norm_op_kernel.png differ
diff --git a/doc/fluid/images/beam_search.png b/doc/fluid/images/beam_search.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae
Binary files /dev/null and b/doc/fluid/images/beam_search.png differ
diff --git a/doc/fluid/images/ci_build_whl.png b/doc/fluid/images/ci_build_whl.png
new file mode 100644
index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40
Binary files /dev/null and b/doc/fluid/images/ci_build_whl.png differ
diff --git a/doc/fluid/images/compiler.graffle b/doc/fluid/images/compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..8cc678fea3c820103e7ce81f7a5d625d6c1d92de
Binary files /dev/null and b/doc/fluid/images/compiler.graffle differ
diff --git a/doc/fluid/images/compiler.png b/doc/fluid/images/compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..65d34f841afce9756def07dd8ecb9ca44e658bfe
Binary files /dev/null and b/doc/fluid/images/compiler.png differ
diff --git a/doc/fluid/images/control_flow_graph.png b/doc/fluid/images/control_flow_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b
Binary files /dev/null and b/doc/fluid/images/control_flow_graph.png differ
diff --git a/doc/fluid/images/dataflow_equations.png b/doc/fluid/images/dataflow_equations.png
new file mode 100644
index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658
Binary files /dev/null and b/doc/fluid/images/dataflow_equations.png differ
diff --git a/doc/fluid/images/dcgan.png b/doc/fluid/images/dcgan.png
new file mode 100644
index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28
Binary files /dev/null and b/doc/fluid/images/dcgan.png differ
diff --git a/doc/fluid/images/deep_learning.png b/doc/fluid/images/deep_learning.png
new file mode 100644
index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff
Binary files /dev/null and b/doc/fluid/images/deep_learning.png differ
diff --git a/doc/fluid/images/dist-graph.graffle b/doc/fluid/images/dist-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..941399c6ced8d5f65b6c595522b770c88259df4b
Binary files /dev/null and b/doc/fluid/images/dist-graph.graffle differ
diff --git a/doc/fluid/images/dist-graph.png b/doc/fluid/images/dist-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3546b09f1c2ee3e4f60f519d5e47f823f08051a7
Binary files /dev/null and b/doc/fluid/images/dist-graph.png differ
diff --git a/doc/fluid/images/distributed_architecture.graffle b/doc/fluid/images/distributed_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d1b60141342232e06227c2d430ebc60ec349a907
Binary files /dev/null and b/doc/fluid/images/distributed_architecture.graffle differ
diff --git a/doc/fluid/images/distributed_architecture.png b/doc/fluid/images/distributed_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356
Binary files /dev/null and b/doc/fluid/images/distributed_architecture.png differ
diff --git a/doc/fluid/images/ds2_network.png b/doc/fluid/images/ds2_network.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11
Binary files /dev/null and b/doc/fluid/images/ds2_network.png differ
diff --git a/doc/fluid/images/feed_forward.png b/doc/fluid/images/feed_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b
Binary files /dev/null and b/doc/fluid/images/feed_forward.png differ
diff --git a/doc/fluid/images/feed_forward_regularized.png b/doc/fluid/images/feed_forward_regularized.png
new file mode 100644
index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447
Binary files /dev/null and b/doc/fluid/images/feed_forward_regularized.png differ
diff --git a/doc/fluid/images/fluid-compiler.graffle b/doc/fluid/images/fluid-compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c933df2cb855462c52b2d25f7f9a99b95652961d
Binary files /dev/null and b/doc/fluid/images/fluid-compiler.graffle differ
diff --git a/doc/fluid/images/fluid-compiler.png b/doc/fluid/images/fluid-compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b0ffed2039c91a3a00bbb719da08c91c3acf7bb
Binary files /dev/null and b/doc/fluid/images/fluid-compiler.png differ
diff --git a/doc/fluid/images/graph_construction_example.bash b/doc/fluid/images/graph_construction_example.bash
new file mode 100755
index 0000000000000000000000000000000000000000..35e6997abd17588e17a82d448918fc1b3bd7220e
--- /dev/null
+++ b/doc/fluid/images/graph_construction_example.bash
@@ -0,0 +1,11 @@
+cat ./graph_construction_example.dot | \
+    sed 's/color=red/color=red, style=invis/g' | \
+    sed 's/color=green/color=green, style=invis/g' | \
+    dot -Tpng > graph_construction_example_forward_only.png
+
+cat ./graph_construction_example.dot | \
+    sed 's/color=green/color=green, style=invis/g' | \
+    dot -Tpng > graph_construction_example_forward_backward.png
+
+cat ./graph_construction_example.dot | \
+    dot -Tpng > graph_construction_example_all.png
diff --git a/doc/fluid/images/graph_construction_example.dot b/doc/fluid/images/graph_construction_example.dot
new file mode 100644
index 0000000000000000000000000000000000000000..e115f9844bae6ad24f638c8ed4749cea8aff06a9
--- /dev/null
+++ b/doc/fluid/images/graph_construction_example.dot
@@ -0,0 +1,68 @@
+digraph ImageClassificationGraph {
+        ///////// The forward part /////////
+        FeedX [label="Feed", color=blue, shape=box];
+        FeedY [label="Feed", color=blue, shape=box];
+        InitW [label="Init", color=blue, shape=diamond];
+        Initb [label="Init", color=blue, shape=diamond];
+        FC [label="FC", color=blue, shape=box];
+        MSE [label="MSE", color=blue, shape=box];
+
+        x [label="x", color=blue, shape=oval];
+        l [label="l", color=blue, shape=oval];
+        y [label="y", color=blue, shape=oval];
+        W [label="W", color=blue, shape=doublecircle];
+        b [label="b", color=blue, shape=doublecircle];
+        cost [label="cost", color=blue, shape=oval];
+
+        FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
+        FeedY -> l [color=blue];
+        InitW -> W [color=blue];
+        Initb -> b [color=blue];
+        W -> FC [color=blue];
+        b -> FC [color=blue];
+        l -> MSE [color=blue];
+
+        ////////// The backward part /////////
+        MSE_Grad [label="MSE_grad", color=red, shape=box];
+        FC_Grad [label="FC_grad", color=red, shape=box];
+
+        d_cost [label="d cost", color=red, shape=oval];
+        d_y [label="d y", color=red, shape=oval];
+        d_b [label="d b", color=red, shape=oval];
+        d_W [label="d W", color=red, shape=oval];
+
+        cost -> MSE_Grad [color=red];
+        d_cost -> MSE_Grad [color=red];
+        l -> MSE_Grad [color=red];
+        y -> MSE_Grad -> d_y [color=red];
+
+        x -> FC_Grad [color=red];
+        y -> FC_Grad [color=red];
+        d_y -> FC_Grad [color=red];
+        W -> FC_Grad -> d_W [color=red];
+        b -> FC_Grad -> d_b [color=red];
+
+        ////////// The optimization part //////////
+
+        OPT_W [label="SGD", color=green, shape=box];
+        OPT_b [label="SGD", color=green, shape=box];
+
+        W -> OPT_W [color=green];
+        b -> OPT_b [color=green];
+        d_W -> OPT_W -> W [color=green];
+        d_b -> OPT_b -> b [color=green];
+
+        ////////// Groupings //////////
+
+        subgraph clusterMSE {
+                style=invis;
+                MSE;
+                MSE_Grad;
+        }
+
+        subgraph clusterFC {
+                style=invis;
+                FC;
+                FC_Grad;
+        }
+}
diff --git a/doc/fluid/images/graph_construction_example_all.png b/doc/fluid/images/graph_construction_example_all.png
new file mode 100644
index 0000000000000000000000000000000000000000..261611a5721f9aa97874f7e6d897fe48cf667db2
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_all.png differ
diff --git a/doc/fluid/images/graph_construction_example_forward_backward.png b/doc/fluid/images/graph_construction_example_forward_backward.png
new file mode 100644
index 0000000000000000000000000000000000000000..4c69687f4a6a181138f3df72ce5e8aa48487b5be
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_forward_backward.png differ
diff --git a/doc/fluid/images/graph_construction_example_forward_only.png b/doc/fluid/images/graph_construction_example_forward_only.png
new file mode 100644
index 0000000000000000000000000000000000000000..e668c16e0cac73acb4e5dc2b1827557ae77126b4
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_forward_only.png differ
diff --git a/doc/fluid/images/l1_regularization.png b/doc/fluid/images/l1_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972
Binary files /dev/null and b/doc/fluid/images/l1_regularization.png differ
diff --git a/doc/fluid/images/l2_regularization.png b/doc/fluid/images/l2_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298
Binary files /dev/null and b/doc/fluid/images/l2_regularization.png differ
diff --git a/doc/fluid/images/local-graph.graffle b/doc/fluid/images/local-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..19e509bd9af3c1e9a3f5e0f16ddd281457a339c5
Binary files /dev/null and b/doc/fluid/images/local-graph.graffle differ
diff --git a/doc/fluid/images/local-graph.png b/doc/fluid/images/local-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..ada51200f793a9bb18911e7d63cfdb3244b967d7
Binary files /dev/null and b/doc/fluid/images/local-graph.png differ
diff --git a/doc/fluid/images/local_architecture.graffle b/doc/fluid/images/local_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..49fcc663ebe3824aa234e3a67aadf285cb417877
Binary files /dev/null and b/doc/fluid/images/local_architecture.graffle differ
diff --git a/doc/fluid/images/local_architecture.png b/doc/fluid/images/local_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122
Binary files /dev/null and b/doc/fluid/images/local_architecture.png differ
diff --git a/doc/fluid/images/lookup_table.png b/doc/fluid/images/lookup_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..72dfe3547f731d0d090338afb206b0549dff472e
Binary files /dev/null and b/doc/fluid/images/lookup_table.png differ
diff --git a/doc/fluid/images/lookup_table_training.png b/doc/fluid/images/lookup_table_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e
Binary files /dev/null and b/doc/fluid/images/lookup_table_training.png differ
diff --git a/doc/fluid/images/loss_equation.png b/doc/fluid/images/loss_equation.png
new file mode 100644
index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e
Binary files /dev/null and b/doc/fluid/images/loss_equation.png differ
diff --git a/doc/fluid/images/multi-threads.graffle b/doc/fluid/images/multi-threads.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6
Binary files /dev/null and b/doc/fluid/images/multi-threads.graffle differ
diff --git a/doc/fluid/images/multi-threads@3x.png b/doc/fluid/images/multi-threads@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9
Binary files /dev/null and b/doc/fluid/images/multi-threads@3x.png differ
diff --git a/doc/fluid/images/multigpu_allreduce.graffle b/doc/fluid/images/multigpu_allreduce.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..cb5bc420ceafe8ba4c87694d44ee4e5e4ad06779
Binary files /dev/null and b/doc/fluid/images/multigpu_allreduce.graffle differ
diff --git a/doc/fluid/images/multigpu_allreduce.png b/doc/fluid/images/multigpu_allreduce.png
new file mode 100644
index 0000000000000000000000000000000000000000..87a1b3e8f6dd4a713ec9df9f0037d1da04e9178a
Binary files /dev/null and b/doc/fluid/images/multigpu_allreduce.png differ
diff --git a/doc/fluid/images/multigpu_before_convert.graffle b/doc/fluid/images/multigpu_before_convert.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..6c35ab1b21fb76ceae82d3693ed0d085b5bc0855
Binary files /dev/null and b/doc/fluid/images/multigpu_before_convert.graffle differ
diff --git a/doc/fluid/images/multigpu_before_convert.png b/doc/fluid/images/multigpu_before_convert.png
new file mode 100644
index 0000000000000000000000000000000000000000..9c8f7711165d80a2fa3911280fdee91855a401b1
Binary files /dev/null and b/doc/fluid/images/multigpu_before_convert.png differ
diff --git a/doc/fluid/images/multiple_reader.png b/doc/fluid/images/multiple_reader.png
new file mode 100644
index 0000000000000000000000000000000000000000..b22126b31db4982c13fc3a0827805e6aaf955046
Binary files /dev/null and b/doc/fluid/images/multiple_reader.png differ
diff --git a/doc/fluid/images/paddle-compile.graffle b/doc/fluid/images/paddle-compile.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..a6348cc3dbcaca923c6e794681b2edb85cb9f8f6
Binary files /dev/null and b/doc/fluid/images/paddle-compile.graffle differ
diff --git a/doc/fluid/images/paddle-compile.png b/doc/fluid/images/paddle-compile.png
new file mode 100644
index 0000000000000000000000000000000000000000..e0f13d551ac41afaec627a57dea79356464bf0bf
Binary files /dev/null and b/doc/fluid/images/paddle-compile.png differ
diff --git a/doc/fluid/images/pprof_1.png b/doc/fluid/images/pprof_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb
Binary files /dev/null and b/doc/fluid/images/pprof_1.png differ
diff --git a/doc/fluid/images/pprof_2.png b/doc/fluid/images/pprof_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b
Binary files /dev/null and b/doc/fluid/images/pprof_2.png differ
diff --git a/doc/fluid/images/profiler.png b/doc/fluid/images/profiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1
Binary files /dev/null and b/doc/fluid/images/profiler.png differ
diff --git a/doc/fluid/images/readers.png b/doc/fluid/images/readers.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd59168ce16c9e2a0ef45303c28c997cfd7740be
Binary files /dev/null and b/doc/fluid/images/readers.png differ
diff --git a/doc/fluid/images/remote_executor.graffle b/doc/fluid/images/remote_executor.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..41b2067311694b56d211a4f32d1b76884eeffd2d
Binary files /dev/null and b/doc/fluid/images/remote_executor.graffle differ
diff --git a/doc/fluid/images/remote_executor.png b/doc/fluid/images/remote_executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..744e2fb2e0f1bbe058e991ba7b2a09000965ee79
Binary files /dev/null and b/doc/fluid/images/remote_executor.png differ
diff --git a/doc/fluid/images/rnn.dot b/doc/fluid/images/rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5
--- /dev/null
+++ b/doc/fluid/images/rnn.dot
@@ -0,0 +1,87 @@
+digraph G {
+  label = "simple RNN implementation" 
+
+  ranksep=2;
+
+  //graph [nodesep=1, ranksep=1];
+
+  node[nodesep=1]
+
+  subgraph cluster0 {
+    label = "global scope"
+    rankdir = TB
+    W
+    boot_memory
+    input
+    output
+  }
+
+  subgraph cluster1 {
+    label = "step-scope 0"
+    rankdir = TB
+    memory0[label="memory"]
+    prememory0[label="pre-memory"]
+    step_input0[label="step input"]
+    step_output0[label="step output"]
+  }
+
+  subgraph cluster2 {
+    label = "step-scope 1"
+    rankdir = TB
+    memory1[label="memory"]
+    prememory1[label="pre-memory"]
+    step_input1[label="step input"]
+    step_output1[label="step output"]
+  }
+
+  subgraph cluster3 {
+    label = "step-scope 2"
+    rankdir = TB
+    memory2[label="memory"]
+    prememory2[label="pre-memory"]
+    step_input2[label="step input"]
+    step_output2[label="step output"]
+  }
+
+  stepnet [shape=box]
+  stepnet0 [shape=box, style=dashed]
+  stepnet1 [shape=box, style=dashed]
+  stepnet2 [shape=box, style=dashed]
+
+
+  edge[color=blue]
+  boot_memory -> prememory0 [label="init" color="blue"]
+  memory0 -> prememory1  [label="copy/reference" color="blue"]
+  memory1 -> prememory2 [label="copy/reference" color="blue"]
+
+  edge[color=black]
+  W -> stepnet0[constraint=false, style=dashed]
+  W -> stepnet1[constraint=false, style=dashed]
+  W -> stepnet2[constraint=false, style=dashed]
+
+  memory0 -> stepnet0[style=dashed]
+  prememory0 -> stepnet0 -> step_output0[style=dashed]
+
+  memory1 -> stepnet1[style=dashed]
+  prememory1 -> stepnet1 -> step_output1[style=dashed]
+
+  memory2 -> stepnet2[style=dashed]
+  prememory2 -> stepnet2 -> step_output2[style=dashed]
+
+  input -> step_input0
+  input -> step_input1
+  input -> step_input2
+
+  step_input0 -> stepnet0 [style=dashed]
+  step_input1 -> stepnet1[style=dashed]
+  step_input2 -> stepnet2[style=dashed]
+
+  step_output0 -> output
+  step_output1 -> output
+  step_output2 -> output
+
+  stepnet0 -> stepnet[style=dashed]
+  stepnet1 -> stepnet[style=dashed]
+  stepnet2 -> stepnet[style=dashed]
+
+}
diff --git a/doc/fluid/images/rnn.jpg b/doc/fluid/images/rnn.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9867e404cf959df0dce6ded5222b466c788fb840
Binary files /dev/null and b/doc/fluid/images/rnn.jpg differ
diff --git a/doc/fluid/images/rnn.png b/doc/fluid/images/rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..e139e373fe8396782044cfd936fdde624f8c66fe
Binary files /dev/null and b/doc/fluid/images/rnn.png differ
diff --git a/doc/fluid/images/rnn_2level_data.dot b/doc/fluid/images/rnn_2level_data.dot
new file mode 100644
index 0000000000000000000000000000000000000000..1d85ae2617a915ad0ad8288d848b607cc37ad297
--- /dev/null
+++ b/doc/fluid/images/rnn_2level_data.dot
@@ -0,0 +1,75 @@
+digraph G {
+  chapter [label="chapter"]
+
+  subgraph cluster0 {
+    label = "paragraph 0"
+
+    top_rnn0[label="top rnn step 0" shape=box]
+
+    p0 [label="paragraph 0"]
+    p1 [label="paragraph 1"]
+  }
+
+  subgraph cluster1{
+    label = "paragraph 1"
+
+    top_rnn1[label="top rnn step 1" shape=box]
+
+    p2 [label="paragraph 0"]
+    p3 [label="paragraph 1"]
+  }
+
+  subgraph cluster_p0 {
+    label = "sentence 0"
+
+    low_rnn0 [label="low rnn step 0" shape=box]
+    s00 [label="sentence 0"]
+    s01 [label="sentence 1"]
+
+    low_rnn0 -> s00
+    low_rnn0 -> s01
+  }
+
+  subgraph cluster_p1 {
+    label = "sentence 1"
+    low_rnn1 [label="low rnn step 1" shape=box]
+    s10 [label="sentence 0"]
+    s11 [label="sentence 1"]
+    low_rnn1 -> s10
+    low_rnn1 -> s11
+  }
+
+  subgraph cluster_p2 {
+    label = "sentence 1"
+    low_rnn2 [label="low rnn step 0" shape=box]
+    s20 [label="sentence 0"]
+    s21 [label="sentence 1"]
+    low_rnn2 -> s20
+    low_rnn2 -> s21
+  }
+
+  subgraph cluster_p3 {
+    label = "sentence 1"
+    low_rnn3 [label="low rnn step 1" shape=box]
+    s30 [label="sentence 0"]
+    s31 [label="sentence 1"]
+    low_rnn3 -> s30
+    low_rnn3 -> s31
+  }
+
+
+  chapter -> top_rnn0
+  chapter -> top_rnn1
+
+  top_rnn0 -> p0
+  top_rnn0 -> p1
+  top_rnn1 -> p2
+  top_rnn1 -> p3
+
+
+  p0 -> low_rnn0
+  p1 -> low_rnn1
+  p2 -> low_rnn2
+  p3 -> low_rnn3
+
+}
diff --git a/doc/fluid/images/rnn_2level_data.png b/doc/fluid/images/rnn_2level_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..4be81b2430717a6a506342a09fc26899568574c6
Binary files /dev/null and b/doc/fluid/images/rnn_2level_data.png differ
diff --git a/doc/fluid/images/single-thread@3x.png b/doc/fluid/images/single-thread@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a
Binary files /dev/null and b/doc/fluid/images/single-thread@3x.png differ
diff --git a/doc/fluid/images/sparse_update.graffle b/doc/fluid/images/sparse_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..08d689a58f83698d8c1158ee3990ed8abf3a7a9a
Binary files /dev/null and b/doc/fluid/images/sparse_update.graffle differ
diff --git a/doc/fluid/images/sparse_update.png b/doc/fluid/images/sparse_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c872e6ac479f7d1b818a4a207956c43155d0ad7
Binary files /dev/null and b/doc/fluid/images/sparse_update.png differ
diff --git a/doc/fluid/images/test.dot b/doc/fluid/images/test.dot
new file mode 100644
index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a
--- /dev/null
+++ b/doc/fluid/images/test.dot
@@ -0,0 +1,35 @@
+
+digraph Test {
+    z -> generator -> G_img;
+    G_img -> discriminator -> D_f -> d_loss_f;
+    label0 -> d_loss_f -> d_loss;
+
+    img -> discriminator -> D_t -> d_loss_t;
+    label1 -> d_loss_t -> d_loss;
+
+    d_loss -> d_loss_t[color=red, style=dashed];
+    d_loss -> d_loss_f[color=red, style=dashed];
+    d_loss_t -> D_t[color=red, style=dashed];
+    d_loss_f -> D_f[color=red, style=dashed];
+    D_t -> discriminator[color=red, style=dashed];
+    D_f -> discriminator[color=red, style=dashed];
+
+    D_f -> g_loss;
+    label2 -> g_loss;
+
+    g_loss -> D_f[color=green, style=dashed];
+    D_f -> discriminator[color=green, style=dashed];
+    discriminator -> G_img[color=green, style=dashed];
+    G_img -> generator[color=green, style=dashed];
+
+    discriminator [color=red, shape=box];
+    generator [color=green, shape=box];
+    z [shape=diamond];
+    img [shape=diamond];
+    label0 [shape=diamond];
+    label1 [shape=diamond];
+    label2 [shape=diamond];
+
+    d_loss [color=red];
+    g_loss [color=green];
+}
diff --git a/doc/fluid/images/test.dot.png b/doc/fluid/images/test.dot.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55
Binary files /dev/null and b/doc/fluid/images/test.dot.png differ
diff --git a/doc/fluid/images/theta_star.gif b/doc/fluid/images/theta_star.gif
new file mode 100644
index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2
Binary files /dev/null and b/doc/fluid/images/theta_star.gif differ
diff --git a/doc/fluid/images/timeline.jpeg b/doc/fluid/images/timeline.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385
Binary files /dev/null and b/doc/fluid/images/timeline.jpeg differ
diff --git a/doc/fluid/images/tracing.jpeg b/doc/fluid/images/tracing.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5
Binary files /dev/null and b/doc/fluid/images/tracing.jpeg differ
diff --git a/doc/fluid/index_cn.rst b/doc/fluid/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d878d192cae7ee9e8b8fdb4f615839c186fdf334
--- /dev/null
+++ b/doc/fluid/index_cn.rst
@@ -0,0 +1,12 @@
+PaddlePaddle Fluid
+==========================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_cn.rst
+  build_and_install/index_cn.rst
+  design/index_cn.rst
+  howto/index_cn.rst
+  dev/index_cn.rst
+  faq/index_cn.rst
diff --git a/doc/fluid/index_en.rst b/doc/fluid/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2bc76b58982cf50e637d15cca0c5d78166aa73a9
--- /dev/null
+++ b/doc/fluid/index_en.rst
@@ -0,0 +1,12 @@
+PaddlePaddle Fluid
+==========================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_en.rst
+  build_and_install/index_en.rst
+  design/index_en.rst
+  howto/index_en.rst
+  dev/index_en.rst
+  faq/index_en.rst
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 260b6c9fd1b364433cae098bacea77aa7fe9e266..76b82fd97f1ed642696c4414676b694ebda9ad81 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -13,7 +13,7 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
 import paddle
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index e5757b86b43001bc6090d8edd0aaa5ff4fc476ee..5aa5c1381fa3fad4ebc181c7868da03ae0138016 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -13,7 +13,7 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
 import paddle
diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt
index 286fe8845cd7a909d4030540e72362864b536063..be957d37b14c618e9346251b3bd3dbaf1541773f 100644
--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
@@ -20,13 +20,15 @@ configure_file(
     "${BINARY_BUILD_DIR_EN}/conf.py"
     @ONLY)
 
-sphinx_add_target(paddle_docs
+sphinx_add_target(paddle_v2_docs
                   html
                   ${BINARY_BUILD_DIR_EN}
                   ${SPHINX_CACHE_DIR_EN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_EN})
 
+add_dependencies(paddle_v2_docs gen_proto_py paddle_python)
+
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
 
@@ -41,11 +43,13 @@ configure_file(
     "${BINARY_BUILD_DIR_CN}/conf.py"
     @ONLY)
 
-sphinx_add_target(paddle_docs_cn
+sphinx_add_target(paddle_v2_docs_cn
                   html
                   ${BINARY_BUILD_DIR_CN}
                   ${SPHINX_CACHE_DIR_CN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
 
+add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)
+
 add_subdirectory(api)
diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt
index 2ad589e8a260e48d46cba2300d6e2bcd4bdd8019..2670a21a227546ffcee4f10f395feef3c58df9b4 100644
--- a/doc/v2/api/CMakeLists.txt
+++ b/doc/v2/api/CMakeLists.txt
@@ -12,9 +12,11 @@ configure_file(
     "${BINARY_BUILD_DIR_EN}/conf.py"
     @ONLY)
 
-sphinx_add_target(paddle_api_docs
+sphinx_add_target(paddle_v2_apis
                   html
                   ${BINARY_BUILD_DIR_EN}
                   ${SPHINX_CACHE_DIR_EN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_v2_apis  gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/doc/v2/build_and_install/index_en.rst b/doc/v2/build_and_install/index_en.rst
index 7e0ca5bcbdbad0a3c97c0045bb57b51137668161..5b3de0f8c3e5496060646b5ddb080d0d338a8bfa 100644
--- a/doc/v2/build_and_install/index_en.rst
+++ b/doc/v2/build_and_install/index_en.rst
@@ -1,32 +1,56 @@
-Install and Build
-=================
+Install and Compile
+===================
 
 .. _install_steps:
 
-Install Steps
-++++++++
+PaddlePaddle provides various methods of installation for many different users.
 
-You can choose either pip or Docker to complete your install:
+Focus on Deep Learning Model Development
+----------------------------------------
+
+PaddlePaddle provides many Python wheel packages that can be installed with pip:
 
 .. toctree::
-   :maxdepth: 1
+	:maxdepth: 1
 
-   pip_install_en.rst
-   docker_install_en.rst
+	pip_install_en.rst
 
-Build from Source
------------------
+This is the most convenient way of installation. Please choose the installation package that matches your machine configuration and operating system.
+
+Focus on the Underlying Framework
+---------------------------------
+
+PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
+
+.. toctree::
+	:maxdepth: 1
+
+	docker_install_en.rst
 
-..  warning::
+We recommend running PaddlePaddle in Docker. This method has the following advantages:
 
-    We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary.
+- Does not require installation of third-party dependencies. 
+- Easy to share runtime environment. 
 
-..  toctree::
+Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below:
+
+.. toctree::
     :maxdepth: 1
 
-    build_from_source_en.md
+    build_from_source_en.rst
+
+.. warning::
+
+	One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time consuming.
+
 
 FAQ
-++++++++++
+-----------
+
+For any problems during installation, please refer to the page below for answers:
+
+:ref:`常见问题解答 <install_faq>`
+
+If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community:
 
-`FAQ `_
+`创建issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_
diff --git a/doc/fluid/design/interface/00.why_plain_c.md b/doc/v2/design/interface/00.why_plain_c.md
similarity index 100%
rename from doc/fluid/design/interface/00.why_plain_c.md
rename to doc/v2/design/interface/00.why_plain_c.md
diff --git a/doc/fluid/design/interface/01.inference_implementation.md b/doc/v2/design/interface/01.inference_implementation.md
similarity index 100%
rename from doc/fluid/design/interface/01.inference_implementation.md
rename to doc/v2/design/interface/01.inference_implementation.md
diff --git a/doc/v2/design/interface/index_cn.rst b/doc/v2/design/interface/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2509a5c5f4182d8ce3a16a3b7bd92c0d7bf5b056
--- /dev/null
+++ b/doc/v2/design/interface/index_cn.rst
@@ -0,0 +1,7 @@
+多语言接口
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  00.why_plain_c.md
diff --git a/doc/v2/design/interface/index_en.rst b/doc/v2/design/interface/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..356e58c39c5ef6ee5ee50ab999b85f88628bfb85
--- /dev/null
+++ b/doc/v2/design/interface/index_en.rst
@@ -0,0 +1,7 @@
+Multilingual Interface
+-----------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  00.why_plain_c.md
diff --git a/doc/v2/design/mkl/mkldnn.md b/doc/v2/design/mkl/mkldnn.md
index e2fe1e6b26ffa73fda81863abfadf697c0acbfcf..1bd2e7bc34ee79eb753b3520d97e5e7beca89b0b 100644
--- a/doc/v2/design/mkl/mkldnn.md
+++ b/doc/v2/design/mkl/mkldnn.md
@@ -44,7 +44,7 @@ MKL,MKLML以及MKL-DNN三者关系如下表:
 
 | Name        |  Open Source     | License     | Descriptions  |
 | :---------- | :--------------- | :---------- | :------------ |
-|   MKL       |     No           | Proprietary | Accelerate math processing routines | 
+|   MKL       |     No           | Proprietary | Accelerate math processing routines |
 |   MKLML     |     No           | Proprietary | Small package of MKL, especially for Machine Learning |
 |   MKL-DNN   |     Yes          | Apache 2.0  | Accelerate primitives processing routines especially for Deep Neural Networks  |
 
@@ -89,7 +89,7 @@ PaddlePaddle/Paddle
 ### CMake
 在`CMakeLists.txt`中提供一个与MKL有关的总开关:`WITH_MKL`,它负责决定编译时是否使用MKLML和MKL-DNN
 
-- `WITH_MKLML` 控制是否使用MKLML库。 
+- `WITH_MKLML` 控制是否使用MKLML库。
 当打开`WITH_MKL`时,会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。
 编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。
 MKLML的库目前都是动态库,主要包括`libiomp5.so`和`libmklml_intel.so`。
@@ -172,7 +172,7 @@ if use_mkldnn
     self.layer_type = mkldnn_*
 ```
 
-所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。 
+所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。
 
 同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。
 
diff --git a/doc/v2/dev/index_en.rst b/doc/v2/dev/index_en.rst
index 549f5fa9aace7eb699d229e5f61fe10ae4ed4d66..36516b7953224e799e1065fd7930509eec0aa650 100644
--- a/doc/v2/dev/index_en.rst
+++ b/doc/v2/dev/index_en.rst
@@ -1,9 +1,27 @@
 Development
 ------------
 
+
+PaddlePaddle adheres to the following three sections of code and document specifications.
+
+
+PaddlePaddle uses git for version control, and Docker is used for the building and testing environment. The code includes CUDA, C++, Python, Shell and other programming languages, which comply with Google C++ Style and PEP 8, and the code base includes style checking by an automatic inspection tool. Code comments need to follow the Doxygen specification. The code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, build tests and code development.
 ..  toctree::
   :maxdepth: 1
 
   contribute_to_paddle_en.md
+
+
+PaddlePaddle is well documented in English and Chinese. We recommend using the English version of the documents and problem description. The design documents focus on problem descriptions, backgrounds, and are followed by solutions. As documents are generated by Sphinx, code comments should comply with the Sphinx documentation standard. We recommend using the PaddlePaddle.org tool to compile, generate, and preview documents locally. Please refer to:
+
+..  toctree::
+  :maxdepth: 1
+
   write_docs_en.rst
+
+PaddlePaddle V2 defines new operations by adding new Layers. You can implement various complex layers by combining basic APIs to satisfy most applications. If you want to customize a layer, please refer to the following; patches are welcome.
+
+..  toctree::
+  :maxdepth: 1
+
   new_layer_en.rst
diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst
index a055bb04c0c093c9159290067e5ccbd2525cd519..4231f2bb5cd800c0cd86835b5d07e491fcde4989 100644
--- a/doc/v2/dev/write_docs_cn.rst
+++ b/doc/v2/dev/write_docs_cn.rst
@@ -2,13 +2,14 @@
 如何贡献文档
 #############
 
-PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成,也可以利用paddlepaddle.org工具来编译和预览文档。
+PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成的,PaddlePaddle.org工具可以帮助我们实现这一编译过程,并提供更好的预览效果。
 
 如何构建文档
 ============
 
 PaddlePaddle的文档构建有两种方式,分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具,两种方式都有各自的优点,前者方便预览,后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。
 
+我们建议使用PaddlePaddle.org工具来构建文档。
 
 使用PaddlePaddle.org工具
 ------------------------
@@ -31,7 +32,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D
     docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
 
 注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
-之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档
 编译后的文件将被存储在工作目录 /.ppo_workspace/content。
 
 如果不想使用Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
@@ -56,7 +57,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D
     python manage.py runserver
 
 工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。
-之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。
 编译后的文件将被存储在工作目录 /.ppo_workspace/content。
 
 想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。
@@ -64,39 +65,55 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D
 不使用PaddlePaddle.org工具
 --------------------------
 
-使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即
+使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。该方法与 `从源码编译PaddlePaddle `_ 相似,通过从源码中构建可用于编译PaddlePaddle文档的Docker镜像并运行,在进入Docker容器后使用源码中的脚本构建PaddlePaddle文档,具体步骤如下:
 
-[TBD]
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+
+   # 从源码中构建可用于编译PaddlePaddle文档的Docker镜像
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+   # 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档
+   bash -x /paddle/paddle/scripts/docker/build.sh
+
+注:上述命令把当前目录(源码根目录)映射为 container 里的 :code:`/paddle` 目录。
+
+编译完成后,会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 、 ``api/en/html`` 共三个子目录,分别进入这些目录下,执行以下命令:
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。
 
 如果不想使用Docker,也可以使用以下命令直接构建PaddlePaddle文档,即
 
 .. code-block:: bash
 
-   mkdir paddle
-   cd paddle
    git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
    mkdir -p build
    cd build
    cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 
    # 如果只需要构建使用文档,则执行以下命令
-   make -j $processors gen_proto_py
-   make -j $processors paddle_docs paddle_docs_cn
+   make -j $processors paddle_docs
 
    # 如果只需要构建API,则执行以下命令
-   make -j $processors gen_proto_py framework_py_proto
-   make -j $processors copy_paddle_pybind
-   make -j $processors paddle_api_docs
+   make -j $processors paddle_apis
 
 其中$processors代表启动和CPU核一样多的进程来并行编译,可以根据本机的CPU核数设置相应的值。
 
-编译完成后,进入 ``doc/v2`` 目录,如果选择构建文档则会在该目录下生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会生成 ``api/en/html`` 目录,分别进入这些目录下,执行以下命令:
+编译完成后,同样会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录,分别进入这些子目录下,执行以下命令:
 
 .. code-block:: bash
 
    python -m SimpleHTTPServer 8088
 
-在浏览器中输入http://localhost:8088就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。下图为生成的 ``v2`` 英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。
 
 ..  image:: src/doc_en.png
     :align: center
diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst
index f3408a84269aaeef19986c220454555fbbe30e23..6105455e202e4704aa25f0fd9916b9b61a569702 100644
--- a/doc/v2/dev/write_docs_en.rst
+++ b/doc/v2/dev/write_docs_en.rst
@@ -2,21 +2,20 @@
 Contribute Documentation
 ########################
 
-PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
-Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
-When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
+PaddlePaddle's documentation includes both Chinese and English versions. The documentation is built using the ``cmake`` command to drive the ``sphinx`` compiler. The PaddlePaddle.org tool helps us to implement this compilation process and provides better preview results.
 
-How to Build Documentations
-============
+How to build Documentation
+===========================
 
-We recommend using PaddlePaddle.org tool to build documentation
+PaddlePaddle's documentation is built in two ways: using the PaddlePaddle.org tool and without using it. Both methods have their own advantages. The former facilitates previewing, while the latter facilitates debugging by the developer. We could choose to build the documentation with Docker or without it in each of the above ways.
 
+We recommend using PaddlePaddle.org tool to build documentation.
 
-Use PaddlePaddle.org tool
---------------
-This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
+Using PaddlePaddle.org tool
+-----------------------------
+This is the recommended method to build documentation, because it can automatically compile the documentation and preview the documentation directly in a web page. Note that, although you can preview the documentation in other ways, its style may not be consistent with the official website. Compiling with the PaddlePaddle.org tool produces a preview that will be consistent with the official website documentation style.
 
-The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool
+The PaddlePaddle.org tool can be used with Docker and Docker needs to be installed first. Please refer to `Docker's official website `_ on how to install Docker. After installing Docker, you may use the following commands to activate the tool
 
 ..  code-block:: bash
 
@@ -32,8 +31,8 @@ The tool uses Docker, please install it on your system. Please check Docker offi
     # Please specify the working directory through -v
     docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
 
-Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command
-Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run commands
+Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
 The compiled documentations will be stored in /.ppo_workspace/content
 
 
@@ -58,19 +57,79 @@ If you don't wish to use Docker, you can also activate the tool through Django.
     pip install -r requirements.txt
     python manage.py runserver
 
-Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+Specify the PaddlePaddle working directory for the environment variable CONTENT_DIR so that the tool could find where the working directory is.
+
+Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation
 The compiled documentations will be stored in /.ppo_workspace/content
 
-If you want to learn more on the PaddlePaddle.org, please `click here `_ 。
+Please `click here `_ for more information about the PaddlePaddle.org tool.
+
+
+Manually Building the Documentation
+-------------------------------------
+
+To build PaddlePaddle's documentation with Docker, you need to install Docker first. Please refer to `Docker's official website `_ on how to install Docker. This method is quite similar to ` Build From Sources `_ , by constructing, from source code, a docker image that can be used to build PaddlePaddle documentation. Enter the Docker container and use the script ``build.sh`` in the source directory to build the PaddlePaddle documentation. The specific steps are as follows:
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+
+   # Construct a docker image from source code
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+   # Use build.sh to build PaddlePaddle documentation
+   bash -x /paddle/paddle/scripts/docker/build.sh
+
+Note: The above commands map the current directory (source root directory) to the :code:`/paddle` directory in the container.
+
+After compiling, there should be two generated directories: ``doc/v2`` and ``doc/fluid``, where three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands:
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8088 to see the compiled ``v2`` and ``fluid`` Chinese/English documentation pages and English API pages.
+
+If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
+
+.. code-block:: bash
+
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   mkdir -p build
+   cd build
+   cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+
+   # If you only need to build documents, use the following commands
+   make -j $processors paddle_docs
+
+   # If you only need to build APIs, use the following commands
+   make -j $processors paddle_apis
+
+$processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine.
+
+After compiling, there should also be two generated directories: ``doc/v2`` and ``doc/fluid`` . If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html`` will be generated in both directories. If you chose to build APIs, a subdirectory ``api/en/html`` will be generated in both directories. Please enter these subdirectories respectively and execute the following commands:
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8088 to see the compiled ``v2`` and ``fluid`` Chinese/English documentation pages and English API pages. The following figure is an example of the built ``v2`` English documentation home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
 
-How to write Documentations
-============
+..  image:: src/doc_en.png
+    :align: center
+    :scale: 60 %
 
-PaddlePaddle uses `sphinx`_ to compile documentations,Please check sphinx official website for more detail.
+How to write Documentation
+===========================
 
+PaddlePaddle uses `sphinx`_ to compile documentation. Please check the sphinx official website for more details.
 
 How to update www.paddlepaddle.org
-============================
+===================================
 
 Please create PRs and submit them to github, please check `Contribute Code `_ 。
 PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs `_ and
diff --git a/doc/v2/faq/build_and_install/index_cn.rst b/doc/v2/faq/build_and_install/index_cn.rst
index 7c7e896d187e4fe1544d7ec933fa4fa9f24df3cd..f292684fb5fe2df06db5239e7f43fdfa1dd2f2bd 100644
--- a/doc/v2/faq/build_and_install/index_cn.rst
+++ b/doc/v2/faq/build_and_install/index_cn.rst
@@ -139,3 +139,77 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二
     touch ../extern_mklml-stamp/extern_mklml-download
 
     // 4. 接着编译即可
+
+9. 在Mac上无法安装numpy等Python包,权限错误
+----------------------------------------------
+
+Mac上对自带的Python和包有严格的权限保护,最好不要在自带的Python上安装。建议用virtualenv建立一个新的Python环境来操作。
+
+virtualenv的基本原理是将机器上的Python运行所需的运行环境完整地拷贝一份。我们可以在一台机器上制造多份拷贝,并在这多个拷贝之间自由切换,这样就相当于在一台机器上拥有了多个相互隔离、互不干扰的Python环境。
+
+下面简单介绍下如何用virtualenv为Paddle生成一个专用的Python环境:
+
+安装virtualenv:
+::::::::::::::::
+
+virtualenv本身也是Python的一个包,可以用pip进行安装:
+
+..  code-block:: bash
+
+    sudo -H pip install virtualenv
+
+由于virtualenv需要安装给系统自带的Python,因此需要使用sudo权限。
+
+创建一个新的Python运行环境:
+::::::::::::::::::::::::::::::
+
+..  code-block:: bash
+
+    virtualenv --no-site-packages paddle
+
+--no-site-packages 参数表示不拷贝已有的任何第三方包,创造一个完全干净的新Python环境。后面的paddle是我们为这个新创建的环境取的名字。
+
+执行完这一步后,当前目录下应该会出现一个名为paddle(或者你取的其他名字)的目录。这个目录里保存了运行一个Python环境所需要的各种文件。
+
+启动运行环境:
+::::::::::::::::
+
+..  code-block:: bash
+
+    source paddle/bin/activate
+
+执行后会发现命令提示符前面增加了(paddle)字样,说明已经成功启动了名为‘paddle’的Python环境。执行which python,可以发现使用的已经是刚刚创建的paddle目录下的Python。
+
+在这个环境中,我们可以自由地进行Paddle的安装、使用和开发工作,无需担心对系统自带Python的影响。
+
+退出运行环境:
+:::::::::::::::
+
+直接执行:
+
+..  code-block:: bash
+
+    deactivate
+
+可以看到命令提示符前面的(paddle)字样消失。
+
+自动启动某一Python环境:
+::::::::::::::::::::::::::
+
+如果我们经常使用Paddle,我们每次打开终端后都需要执行一下source paddle/bin/activate来启动环境,比较繁琐。为了简便,可以修改终端的配置文件,来让终端每次启动后自动启动特定的Python环境。
+
+执行:
+
+..  code-block:: bash
+
+    vi ~/.bash_profile
+
+打开终端配置文件,并在文件的最后添加一行:
+
+..  code-block:: bash
+
+    source paddle/bin/activate
+
+保存并关闭文件。
+
+这样,每次打开终端时就会自动启动名为‘paddle’的Python环境了。
diff --git a/doc/v2/faq/build_and_install/index_en.rst b/doc/v2/faq/build_and_install/index_en.rst
index 614db457d715665073cec1a495d4d7df6887532f..7488ed8137d57785f36b9f1e1ed1269f864960bc 100644
--- a/doc/v2/faq/build_and_install/index_en.rst
+++ b/doc/v2/faq/build_and_install/index_en.rst
@@ -1,5 +1,143 @@
-############################
-Install, Build and Unit test
-############################
+.. _install_faq:
 
-TBD
+###############################
+Compile, Install, and Unit Test
+###############################
+
+..  contents::
+
+1. Insufficient CUDA driver version
+----------------------------------------------------------------
+
+Many users usually face issues like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is that you may not map the local CUDA driver to a container directory.
+You can solve the issue by running the following commands:
+
+..  code-block:: bash
+
+    $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+For more information about Docker's installation and usage, please refer to `PaddlePaddle Docker documentation `_ .
+
+
+2. Version mismatch between PythonLibs and PythonInterpreter
+----------------------------------------------------------------
+
+It is a common bug when CMake looks up Python. If you install multiple versions of Python, Cmake may find the version mismatch between PythonLibs and PythonInterpreter . You are forced to specify a Python version, as follows.
+
+    ..  code-block:: bash
+
+        cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path> -DPYTHON_INCLUDE_DIR=<inc_path>
+
+You should replace ``<exc_path>``, ``<lib_path>`` and ``<inc_path>`` with your local paths.
+
+3. PaddlePaddle version is 0.0.0
+------------------------------------------------
+This issue would happen when you run the code  `paddle version` or `cmake ..`
+
+..  code-block:: bash
+
+    CMake Warning at cmake/version.cmake:20 (message):
+      Cannot add paddle version from git tag
+
+You should pull all remote branches to your local machine with the command :code:`git fetch upstream` and then run :code:`cmake`
+
+4. paddlepaddle\*.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
+
+The primary cause for this issue is that pip cannot find a PaddlePaddle installation package that matches your current system. The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12 with Python 2.7 and Pip 9.0.1.
+
+You can upgrade Pip with the following command\:
+
+..  code-block:: bash
+
+    pip install --upgrade pip
+
+If it does not work for you, you can run the command :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the suffix of Python package which your system may support and then compare it with the suffix of your installation.
+
+If the system supports :code:`linux_x86_64` and the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest version.
+
+If the system supports :code:`manylinux1_x86_64` and the local installation package is :code:`linux_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again.
+
+
+5. ImportError: No module named v2
+----------------------------------
+Please uninstall Paddle V1 if you have installed it before.
+
+..  code-block:: bash
+
+    pip uninstall py_paddle paddle
+
+Then install Python for PaddlePaddle , enter the build directory and run the following commands
+
+``pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl``
+
+6. Illegal instruction
+-----------------------
+This issue may be caused by the wrong usage of PaddlePaddle binary version which uses avx SIMD instructions to increase the performance of cpu. Please choose the correct version.
+
+7.  Python unittest fails
+--------------------------------
+
+If the following python unittest testcases fail:
+
+..  code-block:: bash
+
+    24 - test_PyDataProvider (Failed)
+    26 - test_RecurrentGradientMachine (Failed)
+    27 - test_NetworkCompare (Failed)
+    28 - test_PyDataProvider2 (Failed)
+    32 - test_Prediction (Failed)
+    33 - test_Compare (Failed)
+    34 - test_Trainer (Failed)
+    35 - test_TrainerOnePass (Failed)
+    36 - test_CompareTwoNets (Failed)
+    37 - test_CompareTwoOpts (Failed)
+    38 - test_CompareSparse (Failed)
+    39 - test_recurrent_machine_generation (Failed)
+    40 - test_PyDataProviderWrapper (Failed)
+    41 - test_config_parser (Failed)
+    42 - test_swig_api (Failed)
+    43 - layers_test (Failed)
+
+Please check the PaddlePaddle unittest logs which may suggest the following:
+
+..  code-block:: bash
+
+    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+The solution is:
+
+* Remove the old PaddlePaddle to make a clean environment for the unit tests. If the PaddlePaddle package is already in Python's site-packages, unit tests would refer to the Python package in site-packages instead of the Python package in the :code:`/python` directory of the source directory.  Setting :code:`PYTHONPATH` to :code:`/python` is also useless because Python's search path would give the priority to the installed Python package.
+
+
+8. Failed to download the MKLML library
+----------------------------------------------
+
+..  code-block:: bash
+
+    make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4
+    make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2
+    make[1]: *** waiting for the unfinished  jobs....
+
+Cause: The network speed or SSL link causes the MKLML library to download unsuccessfully.
+
+The solution is: manually download and install, the specific steps are as follows.
+
+..  code-block:: bash
+
+    // 1. enter the directory
+    cd build/third_party/mklml/src/extern_mklml
+
+    // 2. check the size of the package, normally 75M, if less than 75M, the download fails
+    du -sh mklml_lnx_2018.0.1.20171007.tgz
+
+    // 3. manually download and unzip and make the download success tag:
+    wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz 
+    tar zxf mklml_lnx_2018.0.1.20171007.tgz
+    touch ../extern_mklml-stamp/extern_mklml-download
+
+    // 4. then compile
+    
diff --git a/doc/v2/faq/cluster/index_en.rst b/doc/v2/faq/cluster/index_en.rst
index 855b7e8e53307b82a72c156be4ef509e27edf822..fa942a09625bef78b28456beeb735272b686e061 100644
--- a/doc/v2/faq/cluster/index_en.rst
+++ b/doc/v2/faq/cluster/index_en.rst
@@ -2,4 +2,15 @@
 Cluster Training and Prediction
 ###############################
 
-TBD
+.. contents::
+
+1. Network connection errors in the log during multi-node cluster training
+--------------------------------------------------------------------------
+There may be some errors in the log related to network connection problems during multi-node cluster training, for example, :code:`Connection reset by peer`.
+This kind of error is usually caused by the abnormal exit of a training process in some node, and the other nodes cannot connect with this node any longer. Steps to troubleshoot the problem are as follows:
+
+* Find the first error in the :code:`train.log`, :code:`server.log`, check whether other fault caused the problem, such as FPE, lack of memory or disk.
+
+* If the first error in server.log says "Address already used", this may be caused by the port conflict of the non-exclusive execution. Contact the sys-admin to check if the current MPI cluster supports jobs submitted with parameter :code:`resource=full`. If the current MPI cluster does not support this parameter, change the server port and try again.
+
+* If the current MPI cluster does not support exclusive pattern which allows a process to occupy the whole node, ask the administrator to replace or update this cluster.
diff --git a/doc/v2/faq/index_en.rst b/doc/v2/faq/index_en.rst
index 57df868f760038b25fae30df7ab20a68875ad36a..3fa220792b252617848a1c76bc2be49928e35f64 100644
--- a/doc/v2/faq/index_en.rst
+++ b/doc/v2/faq/index_en.rst
@@ -1,7 +1,8 @@
 FAQ
 ====
 
- 
+This document provides answers to some of the frequently asked questions about PaddlePaddle. If you have a question that is not covered here, please go to `PaddlePaddle Community `_ , to find an answer or submit new `issue `_  , we will reply in time.
+
 ..  toctree::
   :maxdepth: 1
 
diff --git a/doc/v2/faq/model/index_en.rst b/doc/v2/faq/model/index_en.rst
index cb26f59655f97dc28a2047994643ae16b8857964..67a33e08e192e5627ac3b0abd76e979f21ed2079 100644
--- a/doc/v2/faq/model/index_en.rst
+++ b/doc/v2/faq/model/index_en.rst
@@ -2,4 +2,80 @@
 Model Configuration
 ###################
 
-TBD
+..  contents::
+
+1. How to deal with error :code:`Duplicated layer name`
+----------------------------------------------------------
+
+The general reason for this error is that users may have set the same value for the attribute :code:`name` in different layers. Try to find out the :code:`name` attribute with the same value in different layers and set them differently.
+
+2. How to use :code:`paddle.layer.memory`'s attribute :code:`name`
+----------------------------------------------------------------------
+
+* :code:`paddle.layer.memory` is used to get the output of a layer's last timestep and the layer is specified by the attribute :code:`name` . Thus,  :code:`paddle.layer.memory` will associate with the layer that has the same value of attribute :code:`name` , and uses the output of the layer's last timestep as the input of its current timestep.
+
+* All the PaddlePaddle's layers have a unique name, which is set by the attribute :code:`name` . PaddlePaddle will automatically set it for the user when it is not explicitly set. :code:`paddle.layer.memory` is not a real layer, its name is set by the attribute :code:`memory_name`  and PaddlePaddle will also automatically set it when the user does not explicitly set. The :code:`paddle.layer.memory` attribute :code:`name` is used to specify the layer it is associated with, and needs to be explicitly set by the user.
+
+
+3. What is the difference between the two ways of using dropout
+-----------------------------------------------------------------
+
+* There are two ways to use dropout in PaddlePaddle
+
+  * Set the :code:`drop_rate` parameter in the layer's :code:`layer_attr` attribute. Take :code:`paddle.layer.fc` as an example:
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
+
+  * Use :code:`paddle.layer.dropout` layer. Take :code:`paddle.layer.fc` as an example:
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input)
+      drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
+
+* :code:`paddle.layer.dropout` actually uses the :code:`paddle.layer.add_to` layer and sets :code:`drop_rate` as the previous method. This method is very memory intensive.
+
+* PaddlePaddle implements dropout in the activation function rather than in the layer.
+
+* :code:`paddle.layer.lstmemory`, :code:`paddle.layer.grumemory`, :code:`paddle.layer.recurrent` implement activation of output in an unusual way, so we cannot use dropout by setting :code:`drop_rate` . To use dropout for these layers, we could use the second method, which is to use :code:`paddle.layer.dropout`.
+
+4. The differences between different recurrent layers
+--------------------------------------------------------
+Take LSTM as an example. There are several kinds of recurrent layers in PaddlePaddle:
+
+* :code:`paddle.layer.lstmemory`
+* :code:`paddle.networks.simple_lstm`
+* :code:`paddle.networks.lstmemory_group`
+* :code:`paddle.networks.bidirectional_lstm`
+
+According to implementations, recurrent layer can be classified into 2 types:
+
+1. Recurrent layer implemented by recurrent_group:
+
+  * Using this type of recurrent layers, users can access the intermediate value calculated by the recurrent unit within a timestep (eg: hidden states, memory cells, etc.)
+  * :code:`paddle.networks.lstmemory_group` belongs to this type of recurrent layers.
+
+2. Recurrent layer implemented as a complete operation:
+
+  * Users can only access output values when using this type of recurrent layers.
+  * :code:`paddle.layer.lstmemory` , :code:`paddle.networks.simple_lstm` and  :code:`paddle.networks.bidirectional_lstm` belong to this type of recurrent layer.
+
+By implementing recurrent layer as a complete operation, CPU and GPU calculations can be optimized. Therefore, the second type of recurrent layer is more efficient than the first one. In practical applications, we propose to use the second type of recurrent layers if there is no need to access the intermediate variable of LSTM.
+
+In addition, PaddlePaddle also contains a kind of LSTM calculation unit: :code:`paddle.networks.lstmemory_unit`:
+
+  * Unlike the recurrent layer described above, :code:`paddle.networks.lstmemory_unit` defines the computational process of an LSTM unit in a timestep. It is not a complete recurrent layer, nor can it receive sequence data as input.
+  * :code:`paddle.networks.lstmemory_unit` can only be used as a step function in recurrent_group.
+
+5. Can Softmax's calculation dimension be specified?
+--------------------------------------------------------------------
+
+We can't specify calculation dimension for PaddlePaddle's softmax. It can only be calculated by rows.
+In image tasks, for NCHW, if you need to calculate softmax in C dimension, you could use :code:`paddle.layer.switch_order` to change the dimension order, that is, convert NCHW to NHWC, then do the reshape operation and calculate softmax.
+
+6. Does PaddlePaddle support variable-dimensional data inputs
+----------------------------------------------------------------
+
+PaddlePaddle provides :code:`paddle.data_type.dense_array` to support variable-dimensional data input. Simply set the dimension of the data layer to a value larger than the dimension of the input data for occupancy.
diff --git a/doc/v2/howto/capi/compile_paddle_lib_en.md b/doc/v2/howto/capi/compile_paddle_lib_en.md
index 11d69b9b79c1a41898d3060d3fe25a31330334a3..6212a3081116d988630706e83d2349dd200b73ab 100644
--- a/doc/v2/howto/capi/compile_paddle_lib_en.md
+++ b/doc/v2/howto/capi/compile_paddle_lib_en.md
@@ -1,3 +1,175 @@
 ## Install and Build
 
-TBD
+### Download & Install 
+
+  Download the latest C-API development package from CI system and install. You can find the required version in the table below:
+
+
+### From source
+
+  Users can also compile the C-API library from PaddlePaddle source code by compiling with the following compilation options:
+  
+
+
+
+| Options | Value |
+| --- | --- |
+| WITH_C_API | ON |
+| WITH_PYTHON | OFF(recommended) |
+| WITH_SWIG_PY | OFF(recommended) |
+| WITH_GOLANG | OFF(recommended) |
+| WITH_GPU | ON/OFF |
+| WITH_MKL | ON/OFF |
+
+It is best to set up with recommended values to avoid linking with unnecessary libraries. Set other compilation options as you need.
+
+Pull the latest code from GitHub and configure the compilation options as in the following snippet (replace PADDLE_ROOT with the installation path of the PaddlePaddle C-API inference library):
+
+```shell
+PADDLE_ROOT=/path/of/capi
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      -DWITH_GOLANG=OFF \
+      -DWITH_PYTHON=OFF \
+      -DWITH_MKL=OFF \
+      -DWITH_GPU=OFF  \
+      ..
+```
+
+After running the above code to generate Makefile , run: `make && make install`.  After successful compilation, the dependencies required by C-API(includes: (1)PaddlePaddle inference library and header files; (2) Third-party libraries and header files) will be stored in the `PADDLE_ROOT` directory.
+
+If the compilation is successful, see the following directory structure under `PADDLE_ROOT`(includes PaddlePaddle header files and libraries, and third-party libraries and header files(determined by the link methods if necessary)):
+
+```text
+├── include
+│   └── paddle
+│       ├── arguments.h
+│       ├── capi.h
+│       ├── capi_private.h
+│       ├── config.h
+│       ├── error.h
+│       ├── gradient_machine.h
+│       ├── main.h
+│       ├── matrix.h
+│       ├── paddle_capi.map
+│       └── vector.h
+├── lib
+│   ├── libpaddle_capi_engine.a
+│   ├── libpaddle_capi_layers.a
+│   ├── libpaddle_capi_shared.so
+│   └── libpaddle_capi_whole.a
+└── third_party
+    ├── gflags
+    │   ├── include
+    │   │   └── gflags
+    │   │       ├── gflags_completions.h
+    │   │       ├── gflags_declare.h
+    │   │       ...
+    │   └── lib
+    │       └── libgflags.a
+    ├── glog
+    │   ├── include
+    │   │   └── glog
+    │   │       ├── config.h
+    │   │       ...
+    │   └── lib
+    │       └── libglog.a
+    ├── openblas
+    │   ├── include
+    │   │   ├── cblas.h
+    │   │   ...
+    │   └── lib
+    │       ...
+    ├── protobuf
+    │   ├── include
+    │   │   └── google
+    │   │       └── protobuf
+    │   │           ...
+    │   └── lib
+    │       └── libprotobuf-lite.a
+    └── zlib
+        ├── include
+        │   ...
+        └── lib
+            ...
+
+```
+
+### Linking Description:
+
+There are three kinds of linking methods:
+
+1. Linking with dynamic library `libpaddle_capi_shared.so`(This way is much more convenient and easier, **Without special requirements, it is recommended**), refer to the following:
+    1. When compiling the CPU version with `OpenBLAS`, you only need to link one library, `libpaddle_capi_shared.so`, to develop a prediction program with the C-API.
+    1. When compiling the CPU version with the `MKL` library, you also need to link the MKL library directly to develop a prediction program with the PaddlePaddle C-API, because `MKL` has its own dynamic library.
+    1. When compiling the GPU version, the CUDA library is loaded dynamically at run time by the prediction program, so the path of the CUDA library must also be added to the `LD_LIBRARY_PATH` environment variable.
+
+2. Linking with static library `libpaddle_capi_whole.a`, refer to the following:
+    1. Specify `-Wl,--whole-archive` linking options.
+    1. Explicitly link third-party libraries such as `gflags`, `glog`, `libz`, `protobuf`, etc. You can find them under the `PADDLE_ROOT/third_party` directory.
+    1. If OpenBLAS is used when compiling the C-API, you must explicitly link `libopenblas.a`.
+    1. If MKL is used when compiling the C-API, you must explicitly link the MKL dynamic library.
+
+3. Linking with static libraries `libpaddle_capi_layers.a` and `libpaddle_capi_engine.a`, refer to the following:
+    1. This linking method is mainly used for mobile prediction.
+    1. `libpaddle_capi_whole.a` is split into at least two static libraries to reduce the size of the linked libraries.
+    1. Specify `-Wl,--whole-archive -lpaddle_capi_layers`  and  `-Wl,--no-whole-archive -lpaddle_capi_engine` for linking.
+    1. The third-party dependencies need to be linked explicitly, the same as in method 2 above.
diff --git a/doc/v2/howto/capi/index_en.rst b/doc/v2/howto/capi/index_en.rst
index 2cbbe362fd8e06abe9866d998f60fbb3458a80b5..4ec39c9d5223442cf6872edaf7befeb5053b538e 100644
--- a/doc/v2/howto/capi/index_en.rst
+++ b/doc/v2/howto/capi/index_en.rst
@@ -1,6 +1,23 @@
-C-API Prediction Library
+C-API Inference Library
 ========================
 
+After we train a neural network, we use it to do inference. Inference is the process of preparing input data and propagating it through the model to produce the result.
+
+Compared with model training, prediction has the following features:
+
+#. Inference does not require backpropagation and parameter updates, as required during training.
+#. Labels are not needed in prediction.
+#. Most of the time, predictions need to be integrated with the user system.
+
+Therefore, the model prediction SDK needs to be designed separately and has the following features:
+
+#. The predictive SDK does not include backpropagation and parameter updates to reduce the size of the SDK.
+#. The predictive SDK needs a simple user interface for ease of use.
+#. Since the input data may have a variety of structures, the format of the input data is clearly and compactly packaged.
+#. In order to be compatible with user's system, the SDK's interface must conform to the C-standard interface.
+
+PaddlePaddle provides C-API to solve the above problem. Following are the guidelines to use the C-API:
+
 ..  toctree::
   :maxdepth: 1
 
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
index bc3d50b3ffd3b703a3a656caa1f96bdcf683f68b..dee1b7554f97af17989c3f7739d8feea3b6b8e3f 100644
--- a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
@@ -1,3 +1,372 @@
-# Kubernetes Distributed
+# Distributed Training on Kubernetes
 
-TBD
+We introduced how to create a PaddlePaddle Job with a single node on Kubernetes in the
+previous document.
+In this article, we will introduce how to create a PaddlePaddle job with multiple nodes
+on Kubernetes cluster.
+
+## Overall Architecture
+
+Before creating a training job, the users need to slice the training data and deploy
+the Python scripts along with it into the distributed file system
+(we can use different types of Kubernetes Volumes to mount different distributed
+file systems). Before training starts, the program will copy the training data into the
+Container and also save the models at the same path during training. The global architecture
+is as follows:
+
+
+
+The above figure describes a distributed training architecture which contains 3 nodes, each 
+Pod mounts a folder of the distributed file system to save training data and models
+by Kubernetes Volume. Kubernetes created 3 Pods for this training phase and scheduled these on
+3 nodes, each Pod has a PaddlePaddle container. After the containers are created,
+PaddlePaddle starts up the communication between PServer and Trainer and reads training
+data for this training job.
+
+As the description above, we can start up a PaddlePaddle distributed training job on a 
+Kubernetes ready cluster with the following steps:
+
+1. [Build PaddlePaddle Docker Image](#build-a-docker-image)
+1. [Split training data and upload to the distributed file system](#prepare-training-data)
+1. [Edit a YAML file and create a Kubernetes Job](#create-a-job)
+1. [Check the output](#checkout-the-output)
+
+We will introduce these steps as follows:
+
+### Build a Docker Image
+
+Training docker image needs to package the paddle pserver and paddle trainer runtimes, as well as two more processes before we can kick off the training:
+
+- Copying the training data into container.
+- Generating the initialization arguments for `Paddle PServer` and `Paddle Training` processes.
+
+Since the official PaddlePaddle Docker image already has the runtimes we need, we'll take it as the base image and pack some additional scripts for the processes mentioned above to build our training image. For more details, please refer to the following link:
+- https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile
+
+
+```bash
+$ cd doc/howto/usage/k8s/src/k8s_train
+$ docker build -t [YOUR_REPO]/paddle:mypaddle .
+```
+
+And then upload the new Docker Image to a Docker hub:
+
+```bash
+docker push  [YOUR_REPO]/paddle:mypaddle
+```
+
+**[NOTE]**: in the above commands, `[YOUR_REPO]` represents your Docker repository;
+you need to replace it with your own repository name. Below, we use
+`[YOUR_REPO]/paddle:mypaddle` to refer to the Docker image built in this step.
+
+### Prepare Training Data
+
+We can download and split the training data by creating a Kubernetes Job, or customize your image
+by editing [k8s_train](./src/k8s_train/).
+
+Before creating a Job, we need to bind a [persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes) that matches the type of
+the distributed file system in use; the generated dataset will be saved on this volume.
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      hostNetwork: true
+      containers:
+      - name: paddle-data
+        image: paddlepaddle/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/mnt"
+          name: nfs
+        env:
+        - name: OUT_DIR
+          value: /home/work/mfs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: nfs
+          persistentVolumeClaim:
+            claimName: mfs
+      restartPolicy: Never
+```
+
+Create the Job with the following command:
+
+```bash
+> kubectl create -f xxx.yaml
+```
+
+If created successfully, you can see some information like this:
+
+```bash
+[root@paddle-kubernetes-node0 nfsdir]$ tree -d
+.
+`-- paddle-cluster-job
+    |-- 0
+    |   `-- data
+    |-- 1
+    |   `-- data
+    |-- 2
+    |   `-- data
+    |-- output
+    |-- quick_start
+```
+
+The `paddle-cluster-job` above is the job name for this training job; we need 3
+PaddlePaddle training nodes and save the split training data in `paddle-cluster-job` path,
+the folders `0`, `1` and `2` represent the `trainer_id` on each node, the `quick_start` folder is used to store training data, and the `output` folder is used to store the models and logs.
+
+
+### Create a Job
+
+Kubernetes allows users to create objects with YAML files, and we can use a command-line tool
+to create it.
+
+The Job YAML file describes which Docker Image would be used in this training job, how many nodes would be created, what the startup arguments of the `Paddle PServer/Trainer` processes are, and what the type of Volumes is. You can find the details of the YAML fields in
+[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job).
+The following is an example for this training job:
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: jobpath
+        hostPath:
+          path: /home/work/mfs
+      containers:
+      - name: trainer
+        image: [YOUR_REPO]/paddle:mypaddle
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: recommendation
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        volumeMounts:
+        - name: jobpath
+          mountPath: /home/jobpath
+      restartPolicy: Never
+```
+
+In the above YAML file:
+- `metadata.name`, The job name.
+- `parallelism`, the Kubernetes Job would create `parallelism` Pods at the same time.
+- `completions`, the Job would reach the success status only when the number of successful Pods (exit code 0)
+  is equal to `completions`.
+- `volumeMounts`, the name field `jobpath` is a key, the `mountPath` field represents
+  the path in the container, and we can define the `jobpath` in the `volumes` field, and use `hostPath`
+  to configure the host path we want to mount.
+- `env`, the environment variables in the Container, we pass some startup arguments by
+  this approach, some details are as follows:
+  - JOB_PATH:the mount path in the container
+  - JOB_NAME:the job name
+  - TRAIN_CONFIG_DIR:the job path in the container, we can find the training data path by
+    combining it with JOB_NAME.
+  - CONF_PADDLE_NIC: the argument `--nics` of `Paddle PServer` process, the network
+    device name.
+  - CONF_PADDLE_PORT: the argument `--port` of `Paddle PServer` process.
+  - CONF_PADDLE_PORTS_NUM: the argument `--ports_num` of `Paddle PServer`, the port number
+    for dense parameter update.
+  - CONF_PADDLE_PORTS_NUM_SPARSE:the argument `--ports_num_for_sparse` of `Paddle PServer`,
+    the port number for sparse parameter update.
+  - CONF_PADDLE_GRADIENT_NUM:the number of training nodes, the argument
+    `--num_gradient_servers` of `Paddle PServer` and `Paddle Trainer`.
+
+You can find more detailed information
+[here](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html).
+
+We can use the command-line tool of Kubernetes to create a Job when we finish the YAML file:
+
+```bash
+kubectl create -f job.yaml
+```
+
+Upon successful creation, Kubernetes would create 3 Pods as PaddlePaddle training node,
+pull the Docker image and begin to train.
+
+
+### Checkout the Output
+
+During training, we can check the logs and the output models, which are stored in
+the `output` folder.
+
+**NOTE**, `node_0`, `node_1` and `node_2` represent the
+`trainer_id` of the PaddlePaddle training job rather than the node id of Kubernetes.
+
+```bash
+[root@paddle-kubernetes-node0 output]# tree -d
+.
+├── node_0
+│   ├── server.log
+│   └── train.log
+├── node_1
+│   ├── server.log
+│   └── train.log
+├── node_2
+......
+├── pass-00002
+│   ├── done
+│   ├── ___embedding_0__.w0
+│   ├── ___embedding_1__.w0
+......
+```
+
+We can check the status of each training Pod by viewing the logs:
+
+```bash
+[root@paddle-kubernetes-node0 node_0]# cat train.log
+I1116 09:10:17.123121    50 Util.cpp:155] commandline:
+ /usr/local/bin/../opt/paddle/bin/paddle_trainer
+    --nics=eth0 --port=7164
+    --ports_num=2 --comment=paddle_process_by_paddle
+    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
+    --ports_num_for_sparse=2 --config=./trainer_config.py
+    --trainer_count=4 --num_passes=10 --use_gpu=0
+    --log_period=50 --dot_period=10 --saving_period=1
+    --local=0 --trainer_id=0
+    --save_dir=/home/jobpath/paddle-cluster-job/output
+I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
+I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
+[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
+[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__]
+I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
+I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
+I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
+I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
+I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
+I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
+I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
+I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
+I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+```
+
+## Some Additional Details
+
+### Using Environment Variables
+
+Usually we use environment variables to configure the PaddlePaddle Job which runs on
+Kubernetes. `start_paddle.py` is a startup script that converts the environment variables
+into the startup arguments of the PaddlePaddle processes:
+
+```python
+API = "/api/v1/namespaces/"
+JOBSELECTOR = "labelSelector=job-name="
+JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
+JOB_PATH_OUTPUT = JOB_PATH + "/output"
+JOBNAME = os.getenv("JOB_NAME")
+NAMESPACE = os.getenv("JOB_NAMESPACE")
+PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
+PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
+PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
+PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
+PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
+```
+
+### Communication between Pods
+
+At the beginning of `start_paddle.py`, it initializes and parses the arguments.
+
+```python
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+
+And then query the status of all the other Pods of this Job by the function `getPodList()`, and fetch `trainer_id` by the function `getIdMap(podlist)` if the status of all the Pods is `RUNNING`.
+
+```python
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+
+**NOTE**: `getPodList()` would fetch all the Pods in the current namespace, if some
+Pods are already running, it may cause errors. We will use [StatefulSets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets) instead of
+Kubernetes Pod or Replicaset in the future.
+
+The function `getIdMap(podlist)` fetches the IP addresses of the Pods in `podlist` and then sorts them
+to generate `trainer_id`.
+
+```python
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+After getting the `idMap`, we can generate the arguments of `Paddle PServer` and `Paddle Trainer`
+so that we can start them up by `startPaddle(idMap, train_args_dict)`.
+
+### Create Job
+
+The main goal of `startPaddle` is generating the arguments of `Paddle PServer` and
+`Paddle Trainer` processes. Take `Paddle Trainer` as an example, we parse the
+environment variables and then get `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM`, etc., and
+finally find `trainerId` from `idMap` according to its IP address.
+
+```python
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
diff --git a/doc/v2/howto/cmd_parameter/index_en.rst b/doc/v2/howto/cmd_parameter/index_en.rst
index 0e3c72d27aca063f1b6f1c23e55718dba373c40a..f49683948ef78f363e2439cc25332431830eeb24 100644
--- a/doc/v2/howto/cmd_parameter/index_en.rst
+++ b/doc/v2/howto/cmd_parameter/index_en.rst
@@ -2,10 +2,25 @@
 
 Set Command-line Parameters
 ===========================
+The implementation of deep learning algorithms has a variety of characteristics, such as running environment, running stage, structure of the model and the training strategy. PaddlePaddle supports the user to set various command-line parameters flexibly, which helps to achieve control of the model training or prediction process.
+
+In this part, we take several actual scenarios as an example, and the use of some command-line parameters is displayed:
 
 ..  toctree::
   :maxdepth: 1
 
   use_case_en.md
+
+Then, we summarize and classify the use of all command-line parameters:
+
+..  toctree::
+  :maxdepth: 1
+
   arguments_en.md
+
+Finally, the detailed descriptions are given, and we try to explain the properties and significance of these command-line parameters in detail:
+
+..  toctree::
+  :maxdepth: 1
+
   detail_introduction_en.md
diff --git a/doc/v2/howto/index_en.rst b/doc/v2/howto/index_en.rst
index 2079be766f2d8e6d63ca11dccd98f80613309ceb..35ef197f58f1f865e2cdbdebb567d5637284637a 100644
--- a/doc/v2/howto/index_en.rst
+++ b/doc/v2/howto/index_en.rst
@@ -1,11 +1,37 @@
 HOW TO
-=======
+========
+
+PaddlePaddle provides the users the ability to flexibly set various command line parameters to control the model training and inference process. Please refer to the following instructions on using PaddlePaddle:
 
 ..  toctree::
   :maxdepth: 1
 
   cmd_parameter/index_en.rst
+
+PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to:
+
+..  toctree::
+  :maxdepth: 1
+
   cluster/index_en.rst
+
+PaddlePaddle provides a C-API for inference. We provide the following guidelines  for using the C-API:
+
+..  toctree::
+  :maxdepth: 1
+
   capi/index_en.rst
+
+PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to:
+
+..  toctree::
+  :maxdepth: 1
+
   rnn/index_en.rst
+
+For how to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to:
+
+..  toctree::
+  :maxdepth: 1
+
   optimization/gpu_profiling_en.rst
diff --git a/doc/v2/howto/rnn/recurrent_group_en.md b/doc/v2/howto/rnn/recurrent_group_en.md
index d264b0a9f85faffd49c1982117cb5a3ac6ffc015..de6b60f29eb97029a54609cd2194bb7faf3ffec5 100644
--- a/doc/v2/howto/rnn/recurrent_group_en.md
+++ b/doc/v2/howto/rnn/recurrent_group_en.md
@@ -1,3 +1,96 @@
 # Recurrent Group Tutorial
 
-TBD
+## Overview
+
+Sequential data is common in natural language processing.
+
+A sentence is a sequence of words, and many sentences in turn form a paragraph. Therefore, a paragraph can be viewed as a nested sequence with two levels, where each element of the sequence is another sequence. That is to say, sequential data could be recursive. An example of two-level recursive sequential data is that an article is composed of a sequence of sentences, and each sentence a sequence of words.
+
+PaddlePaddle and PaddlePaddle v2 support two-level recursive sequential data. The two-level sequence is a very flexible data format, which helps us to better describe more complex language data such as describing paragraphs and several rounds of dialogues. Based on two-level sequence input, we can design and build a flexible, hierarchical RNN model that encodes input data from the word and sentence level. For the support of arbitrary levels, please refer to PaddlePaddle Fluid.
+
+In PaddlePaddle, `recurrent_group` is an arbitrarily complex RNN unit. The user only needs to define the calculation that the RNN will complete in one time step. PaddlePaddle is responsible for the propagation of information and error in time series.
+
+Furthermore, `recurrent_group` can also be extended to handle two-level sequence. By defining two nested `recurrent_group` operations at the clause level and the word level respectively, a hierarchical and complex RNN is finally achieved.
+
+Currently, in the PaddlePaddle, there are `recurrent_group` and some Layers that can process bidirectional sequences. For details, refer to the document: Layers for supporting double-layer sequences as input.
+
+## Related Concepts
+
+### Basic Principle 
+`recurrent_group` is an arbitrarily complex RNN unit supported by PaddlePaddle. The user only needs to focus on the calculations that the RNN is designed to complete within a single time step. The PaddlePaddle is responsible for completing the propagation of information and gradients over time.
+
+In PaddlePaddle, a simple call to `recurrent_group` is as follows:
+
+``` python 
+recurrent_group(step, input, reverse) 
+```
+- step: A callable function that defines the calculations completed by the RNN unit within a time step
+- input: The input must be a single-layer sequence or a double-layer sequence
+- reverse: Whether to process the input sequence in reverse order
+
+The core of using `recurrent_group` is to design the logic of the step function. The step function can be freely combined with various layers supported by PaddlePaddle to complete arbitrary arithmetic logic. The input of `recurrent_group` (input) becomes the input of the step function. Since the step function only focuses on the calculation within one time step of RNN, here `recurrent_group` completes the splitting of the original input data for us.
+
+### Input
+The input sequence processed by `recurrent_group` is mainly divided into the following three types:
+
+- **Input Data**: When putting a two-level sequence into `recurrent_group`, it will be disassembled into a single-level sequence. When putting a single-level sequence into `recurrent_group`, it will be disassembled into a non-sequence and then passed to the step function. This process is completely transparent to the user. There are two possible types: 1) User input via data_layer; 2) Output from other layers.
+		
+- **Read-only Memory Input**: `StaticInput` defines a read-only Memory. The input specified by `StaticInput` will not be disassembled by `recurrent_group`, and each time step of the `recurrent_group` loop will always be able to reference all inputs. It may be a non-sequence or a single-layer sequence.
+	  
+- **Input of Sequence Generation Task**: `GeneratedInput` is only used to specify input data in a sequence generation task.
+
+### Input Example
+
+Sequence generation tasks mostly follow the encoder-decoder architecture. The encoder and decoder can be arbitrary neural network units capable of processing sequences and RNN is the most popular choice.
+
+Given the encoder output and the current word, the decoder predicts the next most likely word each time. In this structure, the decoder accepts two inputs:
+
+- Target sequence to be generated: an input of the decoder and the basis of the decoder loop. `recurrent_group` will disassemble this input type.
+
+- Encoder output, a non-sequence or single-level sequence: an unbounded memory. Each time step in the decoder loop will reference the entire result and should not be disassembled. This type of input must be specified via `StaticInput`. For more discussion on Unbounded Memory, please refer to the paper [Neural Turing Machines](https://arxiv.org/abs/1410.5401).
+
+In a sequence generation task, the decoder RNN always refers to the word vector of the word predicted at the previous moment as the current time input. `GeneratedInput` will automate this process.
+
+### Output
+The `step` function must return the output of one or more Layers. The output of this Layer will be the final output of the entire `recurrent_group`. In the output process, `recurrent_group` will concatenate the output of each time step, which is also transparent to the user.
+
+### Memory
+Memory can only be defined and used in `recurrent_group`. Memory cannot exist independently and must point to a layer defined by PaddlePaddle. Memory is referenced to get a momentary output from this layer, so memory can be interpreted as a delay operation.
+
+The user can explicitly specify the output of a layer to initialize the memory. When not specified, memory is initialized to 0 by default.
+
+## Sequence-level RNN Introduction
+
+`recurrent_group` helps us to split the input sequence, merge the output, and loop through the sequence of computational logic.
+
+Using this feature, the two nested `recurrent_group` can handle the nested two-level sequences, implementing sequence-level RNN structures at both the word and sentence levels.
+
+- Word-level RNN:  each state corresponds to a word.
+- Sequence-level RNN: a sequence-layer RNN consists of multiple word-layer RNNs. Each word-layer RNN (i.e., each state of a sequence-layer RNN) has a subsequence.
+
+For convenience of description, the following takes the NLP task as an example. A paragraph containing a subsequence is defined as a two-level sequence, and a sentence containing a word is defined as a single-layer sequence. Then, the zero-level sequence is a word.
+
+## Usage of Sequence-level RNN
+
+### Usage of Training Process
+Using `recurrent_group` requires the following conventions:
+
+- **Single-input Single-output**: Both input and output are single layer sequences.
+  - If there are multiple inputs, the number of words in different input sequences must be exactly equal.
+  - A single-layer sequence is output, and the number of words in the output sequence is the same as the input sequence.
+  - memory: define memory to point to a layer in the step function, get a moment output from this layer by referencing memory to form a recurrent connection. The is_seq parameter of memory must be false. If memory is not defined, the operations within each time step are independent.
+  - boot_layer: the initial state of memory, set 0 by default. is_seq in memory must be false.
+ 
+- **Double-input Double-output**: Both input and output are two-level sequence.
+  - If there are multiple input sequences, the number of subsequence contained in different inputs must be strictly equal, but the number of words in the subsequence may not be equal.
+  - output a two-level sequence. The number of subsequence and the number of words are the same as the specified input sequence and the first input is default.
+  - memory: defining memory in the step function, pointing to a layer, by referring to the memory to get the output of this layer at a time, forming a recurrent connection. The memory defined in the outer `recurrent_group` step function can record the state of the previous subsequence, either as a single-level sequence (only as read-only memory) or as a word. If memory is not defined, the operations between subsequence are independent.
+  - boot_layer: the initial state of memory. It is either a single-level sequence (only as read-only memory) or a vector. The default is not set, that is, the initial state is 0.
+
+- **Double-input Single-output**: not supported for now; it fails with the error "In hierachical RNN, all out links should be from sequences now".
+ 
+### Usage of Generation Process
+Using `beam_search` requires following these conventions:
+
+- Word-level RNN: generate the next word from a word.
+- Sequence-level RNN: the single-layer RNN generated subsequence is concatenated into a new double-layer sequence. Semantically, there is no case where a subsequence generates the next subseq directly.
diff --git a/paddle/.gitignore b/paddle/.gitignore
index f921eef14156a97e4fd250f014960e306b43f35a..1c1c0c2c829f088d7e3f52ca007fcb8f33a16a36 100644
--- a/paddle/.gitignore
+++ b/paddle/.gitignore
@@ -1,3 +1,4 @@
+.timestamp
 *.o
 *.a
 .svn
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index d2a4b1335464f553a361728e64ed5ca177ca53da..c44f8a8a8ecc1ba1f886fc41aec863b4ca3458a6 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT WITH_FLUID)
+if(NOT WITH_FLUID_ONLY)
   add_subdirectory(cuda)
   add_subdirectory(function)
   add_subdirectory(utils)
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index cf84568ecdf1227b0d0ed3606a4a9a6e5186af72..06e1f5d5f0884efabfcdf917ca5c35d94ad5dce9 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -89,16 +89,17 @@ SWIG_LINK_LIBRARIES(swig_paddle
     ${START_END}
 )
 
-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
-    COMMAND ${CMAKE_COMMAND} -E touch .timestamp
+add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/.timestamp
     WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
     DEPENDS _swig_paddle
 )
 
 # TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
+add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so)
 
 if(WITH_TESTING)
     IF(NOT PY_PIP_FOUND)
diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt
index 761aeb5b174105edece8880a9f5012c13a63fd11..13cb79129cc2272d215cdb475fb146b37266699e 100644
--- a/paddle/api/test/CMakeLists.txt
+++ b/paddle/api/test/CMakeLists.txt
@@ -1,3 +1,8 @@
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py
+    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/*.py ${CMAKE_CURRENT_BINARY_DIR}
+)
+add_custom_target(copy_api_test ALL DEPENDS testTrain.py)
+
 py_test(testTrain SRCS testTrain.py)
 py_test(testMatrix SRCS testMatrix.py)
 py_test(testVector SRCS testVector.py)
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 63ec51564793ca2255032d0efbe2c47326f8b698..b790fa39fe863bbb00f6cd36d4c63481b7634fe1 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -370,4 +370,48 @@ extern void hl_maxout_backward(real* inGrad,
                                size_t featLen,
                                size_t groups);
 
+/**
+ * @brief   Upsample forward.
+ * @param[in]   inputData   input data.
+ * @param[in]   maskData    the mask data from MaxPoolWithMaskLayer.
+ * @param[in]   batchSize   the batch size of the input.
+ * @param[in]   imgSizeH    image height.
+ * @param[in]   imgSizeW    image width.
+ * @param[in]   channels    the input channels.
+ * @param[in]   outputH     the output height.
+ * @param[in]   outputW     the output width.
+ * @param[out]  outputData  output data.
+ */
+extern void hl_upsample_forward(real* inputData,
+                                real* maskData,
+                                size_t batchSize,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t outputH,
+                                size_t outputW,
+                                real* outputData);
+
+/**
+ * @brief   Upsample backward.
+ * @param[in]   outputGradData  the output grad data.
+ * @param[in]   maskData        the mask data from MaxPoolWithMaskLayer.
+ * @param[in]   batchSize       the batch size of the input.
+ * @param[in]   imgSizeH        image height.
+ * @param[in]   imgSizeW        image width.
+ * @param[in]   channels        the input channels.
+ * @param[in]   outputH         the output height.
+ * @param[in]   outputW         the output width.
+ * @param[out]  inputGradData   the input grad data.
+ */
+extern void hl_upsample_backward(real* outputGradData,
+                                 real* maskData,
+                                 size_t batchSize,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t channels,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 real* inputGradData);
+
 #endif  // HL_CNN_H_
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index c39bd3228d3f2ea7495cd21f5ff60bdfbbd2b51d..997eed62e07827f375c7441554b397fdd0bd6a80 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -224,4 +224,24 @@ inline void hl_maxout_backward(real* inGrad,
                                size_t featLen,
                                size_t group) {}
 
+inline void hl_upsample_forward(real* inputData,
+                                real* maskData,
+                                size_t batchSize,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t outputH,
+                                size_t outputW,
+                                real* outputData) {}  // stub: does nothing
+
+inline void hl_upsample_backward(real* outputGradData,
+                                 real* maskData,
+                                 size_t batchSize,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t channels,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 real* inputGradData) {}  // stub: does nothing
+
 #endif  // HL_CNN_STUB_H_
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index a4459243e8a7c8be58be2255faf89e29817fbdf5..bac743a293cc97b114281e510d06367a86536452 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -1028,3 +1028,79 @@ void hl_maxout_backward(real* inGrad,
       num_kernels, inGrad, outGrad, idData, size, featLen, groups);
   CHECK_SYNC("hl_maxout_backward failed");
 }
+
+__global__ void upsampleForwardCompute(real* input_data,
+                                       real* mask_data,  // target idx as real
+                                       size_t nthreads,  // elements to process
+                                       size_t in_h,
+                                       size_t in_w,
+                                       size_t out_h,
+                                       size_t out_w,
+                                       real* output_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;  // flat thread id
+  if (index < nthreads) {  // threads past the last element do nothing
+    int offset = index / (in_w * in_h) * out_h * out_w;  // output plane base
+    int upsample_idx = static_cast<int>(mask_data[index]);  // idx in plane
+    output_data[offset + upsample_idx] = input_data[index];  // scatter
+  }
+}
+
+__global__ void upsampleBackwardCompute(real* out_grad,
+                                        real* mask_data,  // source idx as real
+                                        size_t nthreads,  // elements to process
+                                        size_t in_h,
+                                        size_t in_w,
+                                        size_t out_h,
+                                        size_t out_w,
+                                        real* input_grad) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;  // flat thread id
+  if (index < nthreads) {  // threads past the last element do nothing
+    int offset = index / (in_w * in_h) * out_h * out_w;  // out_grad plane base
+    int upsample_idx = static_cast<int>(mask_data[index]);  // idx in plane
+    input_grad[index] = out_grad[offset + upsample_idx];  // gather
+  }
+}
+
+void hl_upsample_forward(real* inputData,
+                         real* maskData,
+                         size_t batchSize,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t channels,
+                         size_t outputH,
+                         size_t outputW,
+                         real* outputData) {
+  int num_kernels = batchSize * imgSizeH * imgSizeW * channels;  // 1 per elem
+  int blocks = (num_kernels + 1024 - 1) / 1024;  // ceil(num_kernels / 1024)
+  upsampleForwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inputData,
+                                                              maskData,
+                                                              num_kernels,
+                                                              imgSizeH,
+                                                              imgSizeW,
+                                                              outputH,
+                                                              outputW,
+                                                              outputData);
+  CHECK_SYNC("hl_upsample_forward failed");
+}
+
+void hl_upsample_backward(real* outputGradData,
+                          real* maskData,
+                          size_t batchSize,
+                          size_t imgSizeH,
+                          size_t imgSizeW,
+                          size_t channels,
+                          size_t outputH,
+                          size_t outputW,
+                          real* inputGradData) {
+  int num_kernels = batchSize * imgSizeH * imgSizeW * channels;  // 1 per elem
+  int blocks = (num_kernels + 1024 - 1) / 1024;  // ceil(num_kernels / 1024)
+  upsampleBackwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(outputGradData,
+                                                               maskData,
+                                                               num_kernels,
+                                                               imgSizeH,
+                                                               imgSizeW,
+                                                               outputH,
+                                                               outputW,
+                                                               inputGradData);
+  CHECK_SYNC("hl_upsample_backward failed");
+}
diff --git a/paddle/fluid/framework/.clang-format b/paddle/fluid/.clang-format
similarity index 100%
rename from paddle/fluid/framework/.clang-format
rename to paddle/fluid/.clang-format
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index a4ea74a6d2fbc29dc33a6b57ee453f49ed36c7fa..3840bbe83b68dc2a49aa73feb57a80e9992cad5f 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(details)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
 
@@ -6,9 +7,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 
 if(WITH_GPU)
-  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place memory device_context framework_proto)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place memory device_context framework_proto)
 endif()
 
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@@ -20,9 +21,9 @@ endif()
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
-nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
+cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
 
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
@@ -73,8 +74,8 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 add_custom_command(TARGET framework_py_proto POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto
-    COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+    COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
     COMMENT "Copy generated python proto into directory paddle/fluid/proto."
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
@@ -87,6 +88,9 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
 framework_proto backward glog lod_rank_table feed_fetch_method)
 
+
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
+
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index 3693bc25d81a8309df1a6ddf3d9b08d484596ea9..b8847e4b909cbab67b2ddb6885b45b73d402de19 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/block_desc.h"
+#include 
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-#include 
-
 namespace paddle {
 namespace framework {
 
@@ -147,14 +146,6 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
   if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
     return;
   }
-  need_update_ = true;
-  for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) {
-    auto names = (*it)->InputArgumentNames();
-    for (auto n : names) {
-      // TODO(typhoonzero): delete vars if no other op use it.
-      VLOG(3) << "deleting var " << n;
-    }
-  }
   ops_.erase(ops_.begin() + s, ops_.begin() + e);
 }
 
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 185f018ac1b5863e0ee86fdaa17df1ccbc6e030e..873969b2a884f6d9e133fe87bf72725c36ce8b98 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -89,8 +90,15 @@ class BlockDesc {
 
   OpDesc *InsertOp(size_t index);
 
+  /*
+   * Remove Op and its input/output variables.
+   * Note that for either input or output variable, if it is also an input or
+   * output variable of other ops, we should keep it.
+   */
   void RemoveOp(size_t s, size_t e);
 
+  void RemoveVar(const std::string &name) { vars_.erase(name); }
+
   std::vector AllOps() const;
 
   size_t OpSize() const { return ops_.size(); }
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index adfaba26ace78f547161ad4029a741f3ca8a6764..722bf8e8ecba0c9cbc5e3ad737dbf73148d2873c 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include   // for size_t
-#include 
+#include             // for size_t
+#include   // NOLINT
 #include 
 #include "paddle/fluid/platform/enforce.h"
 
@@ -34,7 +34,7 @@ class Channel {
  public:
   virtual bool CanSend() = 0;
   virtual bool CanReceive() = 0;
-  virtual bool Send(T*) = 0;
+  virtual void Send(T*) = 0;
   virtual bool Receive(T*) = 0;
   virtual size_t Cap() = 0;
   virtual void Lock() = 0;
@@ -84,69 +84,81 @@ class ChannelHolder {
   }
 
   template 
-  bool Send(T* data) {
-    if (!IsInitialized()) return false;
-    PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T)));
+  void Send(T* data) {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    PADDLE_ENFORCE_EQ(
+        holder_->Type(), std::type_index(typeid(T)),
+        "Channel type is not same as the type of the data being sent");
     // Static cast should be safe because we have ensured that types are same
     Channel* channel = static_cast*>(holder_->Ptr());
-    return channel != nullptr ? channel->Send(data) : false;
+    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
+    channel->Send(data);
   }
 
   template 
   bool Receive(T* data) {
-    if (!IsInitialized()) return false;
-    PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T)));
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    PADDLE_ENFORCE_EQ(
+        holder_->Type(), std::type_index(typeid(T)),
+        "Channel type is not same as the type of the data being sent");
     Channel* channel = static_cast*>(holder_->Ptr());
-    return channel != nullptr ? channel->Receive(data) : false;
+    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
+    return channel->Receive(data);
   }
 
   bool IsClosed() {
-    if (IsInitialized()) {
-      return holder_->IsClosed();
-    }
-    return false;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->IsClosed();
   }
 
   bool CanSend() {
-    if (IsInitialized()) {
-      return holder_->CanSend();
-    }
-    return false;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->CanSend();
   }
 
   bool CanReceive() {
-    if (IsInitialized()) {
-      return holder_->CanReceive();
-    }
-    return false;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->CanReceive();
   }
 
   void close() {
-    if (IsInitialized()) holder_->Close();
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->Close();
   }
 
   size_t Cap() {
-    if (IsInitialized()) return holder_->Cap();
-    return -1;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->Cap();
   }
 
   void Lock() {
-    if (IsInitialized()) holder_->Lock();
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->Lock();
   }
 
   void Unlock() {
-    if (IsInitialized()) holder_->Unlock();
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->Unlock();
   }
 
   template 
   void AddToSendQ(const void* referrer, T* data,
                   std::shared_ptr cond,
                   std::function cb) {
-    if (IsInitialized()) {
-      Channel* channel = static_cast*>(holder_->Ptr());
-      if (channel != nullptr) {
-        channel->AddToSendQ(referrer, data, cond, cb);
-      }
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    Channel* channel = static_cast*>(holder_->Ptr());
+    if (channel != nullptr) {
+      channel->AddToSendQ(referrer, data, cond, cb);
     }
   }
 
@@ -154,26 +166,31 @@ class ChannelHolder {
   void AddToReceiveQ(const void* referrer, T* data,
                      std::shared_ptr cond,
                      std::function cb) {
-    if (IsInitialized()) {
-      Channel* channel = static_cast*>(holder_->Ptr());
-      if (channel != nullptr) {
-        channel->AddToReceiveQ(referrer, data, cond, cb);
-      }
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    Channel* channel = static_cast*>(holder_->Ptr());
+    if (channel != nullptr) {
+      channel->AddToReceiveQ(referrer, data, cond, cb);
     }
   }
 
   void RemoveFromSendQ(const void* referrer) {
-    if (IsInitialized()) holder_->RemoveFromSendQ(referrer);
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->RemoveFromSendQ(referrer);
   }
 
   void RemoveFromReceiveQ(const void* referrer) {
-    if (IsInitialized()) holder_->RemoveFromReceiveQ(referrer);
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->RemoveFromReceiveQ(referrer);
   }
 
   inline bool IsInitialized() const { return holder_ != nullptr; }
 
   inline const std::type_index Type() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true);
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
     return holder_->Type();
   }
 
@@ -199,7 +216,8 @@ class ChannelHolder {
 
   template 
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(size_t buffer_size) : type_(std::type_index(typeid(T))) {
+    explicit PlaceholderImpl(size_t buffer_size)
+        : type_(std::type_index(typeid(T))) {
       channel_.reset(MakeChannel(buffer_size));
     }
 
diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h
index 457abbf373d4549229e8fd8bd6b2087cc6b8f5c8..26d454534e1ae38c4f83376c0836a45781ea9101 100644
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include   // for size_t
 #include 
-#include 
+#include   // NOLINT
 #include 
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -31,14 +31,14 @@ class ChannelImpl : public paddle::framework::Channel {
  public:
   virtual bool CanSend();
   virtual bool CanReceive();
-  virtual bool Send(T *);
+  virtual void Send(T *);
   virtual bool Receive(T *);
   virtual size_t Cap() { return cap_; }
   virtual void Lock();
   virtual void Unlock();
   virtual bool IsClosed();
   virtual void Close();
-  ChannelImpl(size_t);
+  explicit ChannelImpl(size_t);
   virtual ~ChannelImpl();
 
   virtual void AddToSendQ(const void *referrer, T *data,
@@ -60,7 +60,7 @@ class ChannelImpl : public paddle::framework::Channel {
     const void *referrer;  // TODO(thuan): figure out better way to do this
     std::function callback;
 
-    QueueMessage(T *item)
+    explicit QueueMessage(T *item)
         : data(item), cond(std::make_shared()) {}
 
     QueueMessage(T *item, std::shared_ptr cond)
@@ -76,10 +76,9 @@ class ChannelImpl : public paddle::framework::Channel {
     }
   };
 
-  bool send_return(bool value) {
+  void send_return() {
     send_ctr--;
     destructor_cond_.notify_all();
-    return value;
   }
 
   bool recv_return(bool value) {
@@ -88,6 +87,21 @@ class ChannelImpl : public paddle::framework::Channel {
     return value;
   }
 
+  std::shared_ptr<QueueMessage> get_first_message(
+      std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
+    while (!queue->empty()) {
+      // Pop messages from the front until one can be delivered.
+      // A message without a callback is always deliverable. A message
+      // with a callback (added by Select) is deliverable only if the
+      // callback returns true for this action; it returns false when
+      // another Select case already ran, so that message is discarded.
+      std::shared_ptr<QueueMessage> m = queue->front();
+      queue->pop_front();
+      if (m->callback == nullptr || m->callback(action)) return m;
+    }
+    return nullptr;
+  }
+
   size_t cap_;
   std::recursive_mutex mu_;
   bool closed_;
@@ -118,45 +132,33 @@ bool ChannelImpl::CanReceive() {
 }
 
 template 
-bool ChannelImpl::Send(T *item) {
+void ChannelImpl::Send(T *item) {
   send_ctr++;
   std::unique_lock lock{mu_};
 
-  // If channel is closed, do nothing
+  // If channel is closed, throw exception
   if (closed_) {
+    send_return();
     lock.unlock();
-    // TODO(abhinavarora) Should panic on closed channel
-    return send_return(false);
+    PADDLE_THROW("Cannot send on closed channel");
   }
 
   // If there is a receiver, directly pass the value we want
   // to send to the receiver, bypassing the channel buffer if any
   if (!recvq.empty()) {
-    std::shared_ptr m = recvq.front();
-    recvq.pop_front();
-    // Do the data transfer
-    // We will do this data transfer if either of the following
-    // cases are true
-    // 1. callback == nullptr // This means it was a regular channel send
-    // 2. callback returns true
-    bool do_send = true;
-    if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
-    if (do_send)
+    std::shared_ptr m =
+        get_first_message(&recvq, ChannelAction::SEND);
+
+    if (m != nullptr) {
       *(m->data) = std::move(*item);
-    else
-      // We cannot do the data transfer because
-      // this QueueMessage was added by Select
-      // and some other case was executed.
-      // So call the Send function again.
-      // We do not care about notifying other
-      // because they would have been notified
-      // by the executed select case.
-      return send_return(Send(item));
-
-    // Wake up the blocked process and unlock
-    m->Notify();
-    lock.unlock();
-    return send_return(true);
+      m->Notify();
+      send_return();
+      return;
+    } else {
+      Send(item);
+      send_return();
+      return;
+    }
   }
 
   // Unbuffered channel will always bypass this
@@ -165,9 +167,8 @@ bool ChannelImpl::Send(T *item) {
   if (buf_.size() < cap_) {
     // Copy to buffer
     buf_.push_back(std::move(*item));
-    // Release lock and return true
-    lock.unlock();
-    return send_return(true);
+    send_return();
+    return;
   }
 
   // Block on channel, because some receiver will complete
@@ -175,8 +176,12 @@ bool ChannelImpl::Send(T *item) {
   auto m = std::make_shared(item);
   sendq.push_back(m);
   m->Wait(lock);
-  // TODO(abhinavarora) Should panic on closed channel
-  return send_return(!m->chan_closed);
+  if (m->chan_closed) {
+    send_return();
+    lock.unlock();
+    PADDLE_THROW("Cannot send on closed channel");
+  }
+  send_return();
 }
 
 template 
@@ -186,39 +191,38 @@ bool ChannelImpl::Receive(T *item) {
 
   // If channel is closed and buffer is empty or
   // channel is unbuffered
-  if (closed_ && buf_.empty()) {
-    lock.unlock();
-    return recv_return(false);
-  }
+  if (closed_ && buf_.empty()) return recv_return(false);
 
   // If there is a sender, directly receive the value we want
-  // from the sender, bypassing the channel buffer if any
+  // from the sender. In case of a buffered channel, read from
+  // buffer and move front of send queue to the buffer
   if (!sendq.empty()) {
-    std::shared_ptr m = sendq.front();
-    sendq.pop_front();
-    // Do the data transfer
-    // We will do this data transfer if either of the following
-    // cases are true
-    // 1. callback == nullptr // This means it was a regular channel send
-    // 2. callback returns true
-    bool do_receive = true;
-    if (m->callback != nullptr)
-      do_receive = m->callback(ChannelAction::RECEIVE);
-    if (do_receive)
-      *item = std::move(*(m->data));
-    else
-      // We cannot do the data transfer because
-      // this QueueMessage was added by Select
-      // and some other case was executed.
-      // So call the Receive function again.
-      // We do not care about notifying other
-      // because they would have been notified
-      // by the executed select case.
-      return recv_return(Receive(item));
-
-    // Wake up the blocked process and unlock
-    m->Notify();
-    lock.unlock();
+    std::shared_ptr m =
+        get_first_message(&sendq, ChannelAction::RECEIVE);
+    if (buf_.size() > 0) {
+      // Case 1 : Channel is Buffered
+      // Do Data transfer from front of buffer
+      // and add a QueueMessage to the buffer
+      *item = std::move(buf_.front());
+      buf_.pop_front();
+      // If first message from sendq is not null
+      // add it to the buffer and notify it
+      if (m != nullptr) {
+        // Copy to buffer
+        buf_.push_back(std::move(*(m->data)));
+        m->Notify();
+      }  // Ignore if there is no first message
+    } else {
+      // Case 2: Channel is Unbuffered
+      // Do data transfer from front of SendQ
+      // If front is nullptr, then recursively call itself
+      if (m != nullptr) {
+        *item = std::move(*(m->data));
+        m->Notify();
+      } else {
+        return recv_return(Receive(item));
+      }
+    }
     return recv_return(true);
   }
 
@@ -227,8 +231,7 @@ bool ChannelImpl::Receive(T *item) {
     // Directly read from buffer
     *item = std::move(buf_.front());
     buf_.pop_front();
-    // Release lock and return true
-    lock.unlock();
+    // return true
     return recv_return(true);
   }
 
diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc
index 73be5cdbe2a1f5994ecee4c415e83962f50532fe..542d791f6bbdf7d68a4786998ccc0233fff6473d 100644
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -14,9 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/channel.h"
 
-#include 
-#include 
-
+#include   // NOLINT
+#include   // NOLINT
 #include "gtest/gtest.h"
 
 using paddle::framework::Channel;
@@ -37,23 +36,25 @@ TEST(Channel, ChannelCapacityTest) {
   delete ch;
 }
 
-void RecevingOrderEqualToSendingOrder(Channel *ch) {
+void RecevingOrderEqualToSendingOrder(Channel *ch, int num_items) {
   unsigned sum_send = 0;
   std::thread t([&]() {
-    for (int i = 0; i < 5; i++) {
-      EXPECT_EQ(ch->Send(&i), true);
+    for (int i = 0; i < num_items; i++) {
+      ch->Send(&i);
       sum_send += i;
     }
   });
-  for (int i = 0; i < 5; i++) {
-    int recv = 999;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));
+  for (int i = 0; i < num_items; i++) {
+    int recv = -1;
     EXPECT_EQ(ch->Receive(&recv), true);
     EXPECT_EQ(recv, i);
   }
   std::this_thread::sleep_for(std::chrono::milliseconds(200));
   CloseChannel(ch);
   t.join();
-  EXPECT_EQ(sum_send, 10U);
+  unsigned expected_sum = (num_items * (num_items - 1)) / 2;
+  EXPECT_EQ(sum_send, expected_sum);
   delete ch;
 }
 
@@ -61,7 +62,7 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
   const size_t buffer_size = 10;
   auto ch = MakeChannel(buffer_size);
   for (size_t i = 0; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Send(&i), true);  // should not block
+    ch->Send(&i);
   }
 
   size_t out;
@@ -82,7 +83,7 @@ void SendReceiveWithACloseChannelShouldPanic(Channel *ch) {
   const size_t data = 5;
   std::thread send_thread{[&]() {
     size_t i = data;
-    EXPECT_EQ(ch->Send(&i), true);  // should not block
+    ch->Send(&i);  // should not block
   }};
 
   std::thread recv_thread{[&]() {
@@ -94,12 +95,18 @@ void SendReceiveWithACloseChannelShouldPanic(Channel *ch) {
   send_thread.join();
   recv_thread.join();
 
-  // After closing send should return false. Receive should
-  // also return false as there is no data in queue.
+  // After closing, Send should panic (throw). Receive should
+  // return false as there is no data in queue.
   CloseChannel(ch);
   send_thread = std::thread{[&]() {
     size_t i = data;
-    EXPECT_EQ(ch->Send(&i), false);  // should return false
+    bool is_exception = false;
+    try {
+      ch->Send(&i);
+    } catch (paddle::platform::EnforceNotMet e) {
+      is_exception = true;
+    }
+    EXPECT_EQ(is_exception, true);
   }};
   recv_thread = std::thread{[&]() {
     size_t i;
@@ -129,7 +136,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
   auto ch = MakeChannel(buffer_size);
 
   for (size_t i = 0; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Send(&i), true);  // sending should not block
+    ch->Send(&i);  // sending should not block
   }
 
   size_t out;
@@ -159,10 +166,17 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
   std::thread t([&]() {
     // Try to write more than buffer size.
     for (size_t i = 0; i < 2 * buffer_size; ++i) {
-      if (i < buffer_size)
-        EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
-      else
-        EXPECT_EQ(ch->Send(&i), false);
+      if (i < buffer_size) {
+        ch->Send(&i);  // should block after 10 iterations
+      } else {
+        bool is_exception = false;
+        try {
+          ch->Send(&i);
+        } catch (paddle::platform::EnforceNotMet e) {
+          is_exception = true;
+        }
+        EXPECT_EQ(is_exception, true);
+      }
     }
   });
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
@@ -173,21 +187,37 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
 
 TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
   auto ch = MakeChannel(0);
-  RecevingOrderEqualToSendingOrder(ch);
+  RecevingOrderEqualToSendingOrder(ch, 20);
+}
+
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) {
+  // Test that Receive Order is same as Send Order when number of items
+  // sent is less than size of buffer
+  auto ch = MakeChannel(10);
+  RecevingOrderEqualToSendingOrder(ch, 5);
 }
 
-TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) {
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) {
+  // Test that Receive Order is same as Send Order when number of items
+  // sent is equal to size of buffer
   auto ch = MakeChannel(10);
-  RecevingOrderEqualToSendingOrder(ch);
+  RecevingOrderEqualToSendingOrder(ch, 10);
+}
+
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
+  // Test that Receive Order is same as Send Order when number of items
+  // sent is greater than the size of buffer
+  auto ch = MakeChannel(10);
+  RecevingOrderEqualToSendingOrder(ch, 20);
 }
 
 void ChannelCloseUnblocksReceiversTest(Channel *ch) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
 
   // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     t[i] = std::thread(
         [&](bool *p) {
@@ -200,7 +230,7 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
 
   // Verify that all the threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], false);
   }
 
@@ -211,27 +241,33 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
 
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-  bool send_success[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+  bool send_success[kNumThreads];
 
   // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     send_success[i] = false;
     t[i] = std::thread(
         [&](bool *ended, bool *success) {
           int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
           *ended = true;
         },
         &thread_ended[i], &send_success[i]);
@@ -241,13 +277,13 @@ void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) {
   if (isBuffered) {
     // If ch is Buffered, atleast 4 threads must be blocked.
     int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       if (!thread_ended[i]) ct++;
     }
     EXPECT_GE(ct, 4);
   } else {
     // If ch is UnBuffered, all the threads should be blocked.
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       EXPECT_EQ(thread_ended[i], false);
     }
   }
@@ -258,21 +294,21 @@ void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
 
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
   if (isBuffered) {
     // Verify that only 1 send was successful
     int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       if (send_success[i]) ct++;
     }
     // Only 1 send must be successful
     EXPECT_EQ(ct, 1);
   }
 
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 // This tests that closing a buffered channel also unblocks
@@ -316,8 +352,11 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
     // Try to send more number of times
     // than receivers
     for (int i = 0; i < 4; i++) {
-      ch->Send(&i);
-      sum_send += i;
+      try {
+        ch->Send(&i);
+        sum_send += i;
+      } catch (paddle::platform::EnforceNotMet e) {
+      }
     }
   });
   for (int i = 0; i < 3; i++) {
@@ -370,19 +409,25 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
 // This tests that destroying a channel unblocks
 //  any senders waiting for channel to have write space
 void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-  bool send_success[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+  bool send_success[kNumThreads];
 
   // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     send_success[i] = false;
     t[i] = std::thread(
         [&](bool *ended, bool *success) {
           int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
           *ended = true;
         },
         &thread_ended[i], &send_success[i]);
@@ -393,14 +438,14 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
   if (isBuffered) {
     // If channel is buffered, verify that atleast 4 threads are blocked
     int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       if (thread_ended[i] == false) ct++;
     }
     // Atleast 4 threads must be blocked
     EXPECT_GE(ct, 4);
   } else {
     // Verify that all the threads are blocked
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       EXPECT_EQ(thread_ended[i], false);
     }
   }
@@ -409,13 +454,13 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
 
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
   // Count number of successful sends
   int ct = 0;
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     if (send_success[i]) ct++;
   }
 
@@ -428,18 +473,18 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
   }
 
   // Join all threads
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 // This tests that destroying a channel also unblocks
 //  any receivers waiting on the channel
 void ChannelDestroyUnblockReceivers(Channel *ch) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
 
   // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     t[i] = std::thread(
         [&](bool *p) {
@@ -453,18 +498,18 @@ void ChannelDestroyUnblockReceivers(Channel *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
 
   // Verify that all threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], false);
   }
   // delete the channel
   delete ch;
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
@@ -508,7 +553,7 @@ void ChannelHolderSendReceive(ChannelHolder *ch) {
   unsigned sum_send = 0;
   std::thread t([&]() {
     for (int i = 0; i < 5; i++) {
-      EXPECT_EQ(ch->Send(&i), true);
+      ch->Send(&i);
       sum_send += i;
     }
   });
@@ -541,8 +586,22 @@ TEST(ChannelHolder, ChannelUninitializedTest) {
   ChannelHolder *ch = new ChannelHolder();
   EXPECT_EQ(ch->IsInitialized(), false);
   int i = 10;
-  EXPECT_EQ(ch->Send(&i), false);
-  EXPECT_EQ(ch->Receive(&i), false);
+  bool send_exception = false;
+  try {
+    ch->Send(&i);
+  } catch (paddle::platform::EnforceNotMet e) {
+    send_exception = true;
+  }
+  EXPECT_EQ(send_exception, true);
+
+  bool recv_exception = false;
+  try {
+    ch->Receive(&i);
+  } catch (paddle::platform::EnforceNotMet e) {
+    recv_exception = true;
+  }
+  EXPECT_EQ(recv_exception, true);
+
   bool is_exception = false;
   try {
     ch->Type();
@@ -620,12 +679,12 @@ TEST(ChannelHolder, TypeMismatchReceiveTest) {
 }
 
 void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
 
   // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     t[i] = std::thread(
         [&](bool *p) {
@@ -638,7 +697,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
 
   // Verify that all the threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], false);
   }
 
@@ -649,27 +708,33 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
 
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-  bool send_success[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+  bool send_success[kNumThreads];
 
   // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     send_success[i] = false;
     t[i] = std::thread(
         [&](bool *ended, bool *success) {
           int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
           *ended = true;
         },
         &thread_ended[i], &send_success[i]);
@@ -679,13 +744,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
   if (isBuffered) {
     // If ch is Buffered, atleast 4 threads must be blocked.
     int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       if (!thread_ended[i]) ct++;
     }
     EXPECT_GE(ct, 4);
   } else {
     // If ch is UnBuffered, all the threads should be blocked.
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       EXPECT_EQ(thread_ended[i], false);
     }
   }
@@ -696,21 +761,21 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
 
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
   if (isBuffered) {
     // Verify that only 1 send was successful
     int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       if (send_success[i]) ct++;
     }
     // Only 1 send must be successful
     EXPECT_EQ(ct, 1);
   }
 
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 // This tests that closing a channelholder unblocks
@@ -748,19 +813,25 @@ TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
 // This tests that destroying a channelholder unblocks
 //  any senders waiting for channel
 void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-  bool send_success[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+  bool send_success[kNumThreads];
 
   // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     send_success[i] = false;
     t[i] = std::thread(
         [&](bool *ended, bool *success) {
           int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
           *ended = true;
         },
         &thread_ended[i], &send_success[i]);
@@ -770,14 +841,14 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
   if (isBuffered) {
     // If channel is buffered, verify that atleast 4 threads are blocked
     int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       if (thread_ended[i] == false) ct++;
     }
     // Atleast 4 threads must be blocked
     EXPECT_GE(ct, 4);
   } else {
     // Verify that all the threads are blocked
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       EXPECT_EQ(thread_ended[i], false);
     }
   }
@@ -786,13 +857,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
 
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
   // Count number of successfuld sends
   int ct = 0;
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     if (send_success[i]) ct++;
   }
 
@@ -805,18 +876,18 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
   }
 
   // Join all threads
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 // This tests that destroying a channelholder also unblocks
 //  any receivers waiting on the channel
 void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
 
   // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     t[i] = std::thread(
         [&](bool *p) {
@@ -830,18 +901,18 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
 
   // Verify that all threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], false);
   }
   // delete the channel
   delete ch;
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
@@ -874,12 +945,12 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
 
 // This tests that closing a channelholder many times.
 void ChannelHolderManyTimesClose(ChannelHolder *ch) {
-  const int num_threads = 15;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const int kNumThreads = 15;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
 
   // Launches threads that try to send data to channel.
-  for (size_t i = 0; i < num_threads / 3; i++) {
+  for (size_t i = 0; i < kNumThreads / 3; i++) {
     thread_ended[i] = false;
     t[i] = std::thread(
         [&](bool *ended) {
@@ -891,7 +962,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
   }
 
   // Launches threads that try to receive data to channel.
-  for (size_t i = num_threads / 3; i < 2 * num_threads / 3; i++) {
+  for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) {
     thread_ended[i] = false;
     t[i] = std::thread(
         [&](bool *p) {
@@ -905,7 +976,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
   }
 
   // Launches threads that try to close the channel.
-  for (size_t i = 2 * num_threads / 3; i < num_threads; i++) {
+  for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) {
     thread_ended[i] = false;
     t[i] = std::thread(
         [&](bool *p) {
@@ -920,13 +991,13 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
 
   // Verify that all threads are unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
   EXPECT_TRUE(ch->IsClosed());
   // delete the channel
   delete ch;
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc
index 25152054eb8452a9667bd65b4441665476c1d46d..e98e9d94bf71fe9ac226ab3ad7f587b37a5c6e33 100644
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
@@ -150,8 +150,9 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
   // Select block
   AddOp("select", {{"X", {dataChanName, quitChanName}},
                    {"case_to_execute", {"caseToExecute"}}},
-        {}, {{"sub_block", casesBlock},
-             {"cases", std::vector{case0Config, case1Config}}},
+        {{"Out", {}}},
+        {{"sub_block", casesBlock},
+         {"cases", std::vector{case0Config, case1Config}}},
         whileBlock);
 
   scope->Var("stepScopes");
@@ -209,9 +210,8 @@ TEST(Concurrency, Go_Op) {
 
   executor.Run(program, &scope, 0, true, true);
 
-  // After we call executor.run, the Go operator should do a channel_send to set
-  // the
-  // "result" variable to 99
+  // After we call executor.run, the Go operator should do a channel_send to
+  // set the "result" variable to 99.
   auto *finalData = tensor.data();
   EXPECT_EQ(finalData[0], 99);
 }
diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index e896a06162527ed0289767901f4b4a33fcd2875f..a66525303da58601f85c40c41854edaf22c3d4ea 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -105,7 +105,7 @@ static void BuildVar(const std::string& param_name,
 TEST(Operator, CPUtoGPU) {
   using namespace paddle::framework;
   using namespace paddle::platform;
-  InitDevices();
+  InitDevices(true);
 
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace cpu_place;
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..85b649b2937f6a281b9ee1fe7bae8101169f6102
--- /dev/null
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -0,0 +1,22 @@
+cc_library(var_handle SRCS var_handle.cc DEPS place)
+cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
+cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+        dynload_cuda)
+cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
+cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
+
+cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
+cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+
+if(WITH_GPU)
+    set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+else()
+    set(multi_devices_graph_builder_deps)
+endif()
+cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
+            scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
+cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
+        simple_threadpool device_context)
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7a1b40c0b60a788b1f0a70e688f8fcbe427ad076
--- /dev/null
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -0,0 +1,42 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+                                         platform::Place place)
+    : op_(framework::OpRegistry::CreateOp(op_desc)),
+      scope_(scope),
+      place_(place) {}
+
+void ComputationOpHandle::RunImpl() {
+  auto *cur_ctx = dev_ctxes_[place_];
+  for (auto *in : inputs_) {
+    bool need_wait =
+        in->generated_op_ && in->generated_op_->dev_ctxes_[place_] != cur_ctx;
+    if (need_wait) {
+      in->generated_op_->Wait(cur_ctx);
+    }
+  }
+
+  op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get(), place_);
+}
+
+std::string ComputationOpHandle::Name() const { return op_->Type(); }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..d6d2d731ca80a0fbc0a2a34027b5b7c3c1977c07
--- /dev/null
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -0,0 +1,41 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+struct ComputationOpHandle : public OpHandleBase {
+  std::unique_ptr op_;
+  Scope *scope_;
+  platform::Place place_;
+
+  ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+                      platform::Place place);
+
+  std::string Name() const override;
+
+ protected:
+  void RunImpl() override;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9180903b864d03e59f55f41410b2240fa4199496
--- /dev/null
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -0,0 +1,79 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fetch_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset,
+                             std::vector *local_scopes)
+    : data_(data), offset_(offset), local_scopes_(local_scopes) {}
+
+FetchOpHandle::~FetchOpHandle() {
+  for (auto *input_var : inputs_) {
+    input_var->pending_ops_.erase(this);
+  }
+}
+
+void FetchOpHandle::Wait(platform::DeviceContext *waited_dev) {
+  PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
+}
+
+void FetchOpHandle::WaitAndMergeCPUTensors() const {
+  std::vector tensors_ptr;
+  tensors_ptr.reserve(tensors_.size());
+  for (auto &t : tensors_) {
+    tensors_ptr.emplace_back(&t);
+  }
+  data_->at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+}
+
+void FetchOpHandle::RunImpl() {
+  auto cpu_ctx =
+      platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+  for (auto *input : inputs_) {
+    auto *var = static_cast(input);
+    var->generated_op_->Wait(cpu_ctx);
+  }
+
+  tensors_.resize(inputs_.size());
+  auto *var = static_cast(inputs_[0]);
+  auto &var_name = var->name_;
+  platform::CPUPlace cpu;
+  auto &scopes = *local_scopes_;
+
+  for (size_t i = 0; i < scopes.size(); ++i) {
+    auto &scope = scopes[i];
+    auto &t = scope->FindVar(var_name)->Get();
+    if (platform::is_gpu_place(var->place_)) {
+#ifdef PADDLE_WITH_CUDA
+      TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
+      dev_ctxes_[t.place()]->Wait();
+#endif
+    } else {
+      tensors_[i].ShareDataWith(t);
+      tensors_[i].set_lod(t.lod());
+    }
+  }
+
+  this->WaitAndMergeCPUTensors();
+}
+
+std::string FetchOpHandle::Name() const { return "Fetch"; }
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..904b2d669f8b156b99197afb0155380d1170a68b
--- /dev/null
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -0,0 +1,49 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FetchOpHandle : public OpHandleBase {
+  FeedFetchList *data_;
+  size_t offset_;
+  std::vector *local_scopes_;
+  std::vector tensors_;
+
+  FetchOpHandle(FeedFetchList *data, size_t offset,
+                std::vector *local_scopes);
+
+  ~FetchOpHandle();
+
+  void Wait(platform::DeviceContext *waited_dev) override;
+
+  void WaitAndMergeCPUTensors() const;
+
+  std::string Name() const override;
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e0dd9e6068174a4b0348d503f4082bee6ff68dac
--- /dev/null
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -0,0 +1,217 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include "paddle/fluid/framework/details/send_op_handle.h"
+#include "paddle/fluid/framework/scope.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
+#endif
+
+#include 
+#include 
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+#ifdef PADDLE_WITH_CUDA
+MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
+    const std::vector &places,
+    const std::string &loss_var_name,
+    const std::unordered_set &params,
+    const std::vector &local_scopes,
+    platform::NCCLContextMap *nccl_ctxs)
+    : loss_var_name_(loss_var_name),
+      places_(places),
+      local_scopes_(local_scopes),
+      nccl_ctxs_(nccl_ctxs) {
+#else
+MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
+    const std::vector &places,
+    const std::string &loss_var_name,
+    const std::unordered_set &params,
+    const std::vector &local_scopes)
+    : loss_var_name_(loss_var_name),
+      places_(places),
+      local_scopes_(local_scopes) {
+#endif
+  for (auto &p : params) {
+    grad_names_.insert(GradVarName(p));
+  }
+}
+
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op,
+                                                const platform::Place &p,
+                                                const size_t &i) const {
+  auto *op_handle = result->ops_.back().get();
+  op_handle->dev_ctxes_[p] = const_cast(
+      platform::DeviceContextPool::Instance().Get(p));
+
+  auto var_names = op->InputArgumentNames();
+
+  for (auto &each_var_name : var_names) {
+    VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i);
+    op_handle->AddInput(var);
+  }
+
+  var_names = op->OutputArgumentNames();
+
+  for (auto &each_var_name : var_names) {
+    CreateOpOutput(result, op_handle, each_var_name, p, i);
+  }
+}
+
+std::unique_ptr MultiDevSSAGraphBuilder::Build(
+    const ProgramDesc &program) const {
+  auto graph = new SSAGraph();
+  SSAGraph &result = *graph;
+  std::unordered_set og_has_been_broadcast;
+
+  // We cannot invoke resize. It is a bug of GCC 4.8
+  result.vars_ = std::vector<
+      std::unordered_map>>>(
+      places_.size());
+
+  bool is_forwarding = true;
+  for (auto *op : program.Block(0).AllOps()) {
+    bool change_forward = false;
+    if (!is_forwarding) {
+      // FIXME(yy): Do not hard code like this
+      if (op->OutputArgumentNames().size() == 1 &&
+          op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) {
+        continue;  // Drop fill 1. for backward coeff;
+      }
+    }
+
+    // append send op if program is distributed trainer main program.
+    // always use the first device
+    if (!is_forwarding && op->Type() == "send") {
+      auto &p = places_[0];
+      auto *s = local_scopes_[0];
+      // FIXME(wuyi): send op always copy from GPU 0
+      result.ops_.emplace_back(new SendOpHandle(*op, s, p));
+      // Create inputs for output on original place and no ssa output
+      // is created for send op.
+      CreateOpHandleIOs(&result, op, p, 0);
+      continue;
+    }
+
+    for (size_t i = 0; i < places_.size(); ++i) {
+      auto &p = places_[i];
+      auto *s = local_scopes_[i];
+
+      result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
+      auto *op_handle = result.ops_.back().get();
+      CreateOpHandleIOs(&result, op, p, i);
+
+      auto var_names = op->OutputArgumentNames();
+
+      if (is_forwarding) {
+        if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
+// Insert ScaleCost OpHandle
+#ifdef PADDLE_WITH_CUDA
+          auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
+#else
+          auto *communication_dev_ctx =
+              platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+#endif
+
+          op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
+                                                communication_dev_ctx);
+          result.ops_.emplace_back(op_handle);
+
+          // FIXME: Currently ScaleLossGradOp only uses device_count as the
+          // scale factor, so it does not depend on any other operators.
+          // VarHandle *loss = GetVarHandle(loss_var_name, place);
+          // loss->pending_ops_.emplace_back(op_handle);
+          // op_handle->inputs_.emplace_back(loss);
+
+          CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i);
+          change_forward = true;
+        }
+      }
+    }
+
+    if (change_forward) {
+      is_forwarding = false;
+    }
+
+    if (!is_forwarding) {
+      auto var_names = op->OutputArgumentNames();
+      // Currently, we assume that once a gradient is generated, it can be
+      // broadcast, and that each gradient is broadcast only once. Other cases,
+      // such as needing to adjust the gradient according to the input once
+      // the gradient is available, are not considered at present.
+      for (auto &og : var_names) {
+        if (grad_names_.count(og) != 0 &&
+            og_has_been_broadcast.count(og) == 0) {  // is param grad
+                                                     // Insert NCCL AllReduce Op
+          og_has_been_broadcast.insert(og);
+#ifdef PADDLE_WITH_CUDA
+          result.ops_.emplace_back(
+              new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
+          auto *op_handle = result.ops_.back().get();
+
+          for (size_t i = 0; i < places_.size(); ++i) {
+            auto &p = places_[i];
+            auto &vars = result.vars_[i][og];
+
+            if (vars.empty()) {  // This device has no data. continue.
+              continue;
+            }
+            auto &prev_grad = vars[vars.size() - 1];
+            op_handle->AddInput(prev_grad.get());
+
+            vars.emplace_back(new VarHandle);
+            auto &var = vars.back();
+            var->place_ = p;
+            var->name_ = og;
+            var->version_ = vars.size() - 1;
+
+            op_handle->AddOutput(var.get());
+          }
+#else
+          PADDLE_ENFORCE("Not implemented");
+#endif
+        }
+      }
+    }
+  }
+
+  /*
+    The dependency graph has been constructed. However, there are still data
+    hazards that need to be handled.
+   */
+  PolishGraphToSupportDataHazards(&result);
+
+  /*
+   * Only variables should be the leaves of graph.
+   */
+  AddOutputToLeafOps(&result);
+
+  if (VLOG_IS_ON(10)) {
+    std::ostringstream sout;
+    PrintGraphviz(*graph, sout);
+    VLOG(10) << sout.str();
+  }
+
+  return std::unique_ptr(graph);
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..de34caab1be85eecb741a5003f026eb982e178ea
--- /dev/null
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -0,0 +1,63 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include 
+#include 
+
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+
+namespace paddle {
+namespace platform {
+class NCCLContextMap;
+}
+
+namespace framework {
+class Scope;
+namespace details {
+// SSA graph builder for running one program on several devices; `places` and
+// `local_scopes` are indexed per device.
+//
+// NOTE: `places_` and `local_scopes_` are kept as references, so the vectors
+// handed to the constructor must outlive the builder.
+class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
+ public:
+  // The CUDA build additionally takes the NCCL context map, which is stored
+  // as a raw pointer and passed to the all-reduce op handles created by
+  // Build().
+#ifdef PADDLE_WITH_CUDA
+  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
+                          const std::string &loss_var_name,
+                          const std::unordered_set<std::string> &params,
+                          const std::vector<Scope *> &local_scopes,
+                          platform::NCCLContextMap *nccl_ctxs);
+#else
+  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
+                          const std::string &loss_var_name,
+                          const std::unordered_set<std::string> &params,
+                          const std::vector<Scope *> &local_scopes);
+#endif
+
+  // Builds the SSA graph for `program`; ownership goes to the caller.
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
+
+ private:
+  void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p,
+                         const size_t &i) const;
+
+ private:
+  std::string loss_var_name_;
+  const std::vector<platform::Place> &places_;
+  const std::vector<Scope *> &local_scopes_;
+  // Variable names for which Build() inserts an NCCL all-reduce (parameter
+  // gradients). Presumably derived from `params` -- confirm in the .cc file.
+  std::unordered_set<std::string> grad_names_;
+
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap *nccl_ctxs_;
+#endif
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55b5f113589e090386d287e228349f22fb94a7ab
--- /dev/null
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -0,0 +1,82 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+// Binds the handle to the given scopes, places and NCCL contexts (all kept by
+// reference) and registers the NCCL device context of every place in
+// `dev_ctxes_`.
+NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    const platform::NCCLContextMap &ctxs)
+    : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
+  for (auto &p : places_) {
+    this->dev_ctxes_[p] = nccl_ctxs_.DevCtx(p);
+  }
+}
+
+// Sums the input variable in place across every local scope with NCCL.
+//
+// The variable name is taken from the first input. Its tensor is looked up in
+// each local scope and reduced into the same buffer with ncclAllReduce, using
+// the communicator and stream of that device's NCCL context.
+void NCCLAllReduceOpHandle::RunImpl() {
+  if (inputs_.size() == 1) {
+    return;  // No need to all reduce when GPU count = 1;
+  }
+
+  // Wait input done
+  for (auto *in : inputs_) {
+    auto &p = static_cast<VarHandle *>(in)->place_;
+    in->generated_op_->Wait(dev_ctxes_[p]);
+  }
+
+  auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
+  int dtype = -1;
+  size_t numel = 0;
+
+  // Collect one call per device first so that all of them can be issued
+  // inside a single NCCLGroupGuard scope below.
+  std::vector<std::function<void()>> all_reduce_calls;
+
+  for (size_t i = 0; i < local_scopes_.size(); ++i) {
+    auto &p = places_[i];
+    auto *s = local_scopes_[i];
+    int dev_id = boost::get<platform::CUDAPlace>(p).device;
+
+    auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
+    void *buffer = const_cast<void *>(lod_tensor.data<void>());
+
+    // Element type and count are taken from the first tensor seen (numel is
+    // read again for as long as it is still 0).
+    if (dtype == -1) {
+      dtype = platform::ToNCCLDataType(lod_tensor.type());
+    }
+
+    if (numel == 0) {
+      numel = static_cast<size_t>(lod_tensor.numel());
+    }
+
+    auto &nccl_ctx = nccl_ctxs_.at(dev_id);
+    auto stream = nccl_ctx.stream();
+    auto comm = nccl_ctx.comm_;
+    // Capture by value: dtype/numel/buffer change on later iterations.
+    all_reduce_calls.emplace_back([=] {
+      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+          buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
+          comm, stream));
+    });
+  }
+
+  platform::NCCLGroupGuard guard;
+  for (auto &call : all_reduce_calls) {
+    call();
+  }
+}
+
+// Returns the fixed name of this op handle.
+std::string NCCLAllReduceOpHandle::Name() const {
+  return std::string("nccl_all_reduce");
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad14a3c5cb4625fa121cad2daed389c441e78771
--- /dev/null
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -0,0 +1,50 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include 
+#include 
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Op handle that all-reduces one variable across the given places with NCCL.
+//
+// NOTE: the three data members are references; the referenced objects must
+// outlive this handle.
+struct NCCLAllReduceOpHandle : public OpHandleBase {
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+  const platform::NCCLContextMap &nccl_ctxs_;
+
+  NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                        const std::vector<platform::Place> &places,
+                        const platform::NCCLContextMap &ctxs);
+
+  std::string Name() const override;
+
+  // Delaying and buffering nccl_all_reduce calls together can significantly
+  // increase performance. Return false here to disable that feature.
+  bool IsMultiDeviceTransfer() override { return true; }
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e4194a7442f677ec8970dbc387bb01ebbbf579f1
--- /dev/null
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -0,0 +1,102 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+// Renders the op as "(<inputs>) --> (<outputs>)\n". Each variable contributes
+// its own DebugString() followed by ", " (the last one included).
+std::string OpHandleBase::DebugString() const {
+  std::stringstream text;
+
+  text << "(";
+  for (auto *input : inputs_) {
+    text << input->DebugString();
+    text << ", ";
+  }
+
+  text << ") --> (";
+  for (auto *output : outputs_) {
+    text << output->DebugString();
+    text << ", ";
+  }
+
+  text << ")\n";
+  return text.str();
+}
+
+// Destroys every CUDA event held in `events_` (CUDA builds only).
+// NOTE(review): if PADDLE_ENFORCE throws on failure, this throws from a
+// destructor -- confirm that this is acceptable.
+OpHandleBase::~OpHandleBase() {
+#ifdef PADDLE_WITH_CUDA
+  for (auto &dev_event : events_) {
+    PADDLE_ENFORCE(cudaEventDestroy(dev_event.second));
+  }
+#endif
+}
+
+// Executes the op through RunImpl().
+//
+// On CUDA builds with `use_event` set, one CUDA event per device context is
+// created the first time (while `events_` is still empty), and after RunImpl()
+// each event is recorded on the stream of its device context. Builds without
+// CUDA require `use_event` to be false.
+void OpHandleBase::Run(bool use_event) {
+#ifdef PADDLE_WITH_CUDA
+  if (events_.empty() && use_event) {
+    for (auto &p : dev_ctxes_) {
+      int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
+      // The event is created on whichever device is current, so select it
+      // first.
+      PADDLE_ENFORCE(cudaSetDevice(dev_id));
+      PADDLE_ENFORCE(
+          cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
+    }
+  }
+#else
+  PADDLE_ENFORCE(!use_event);
+#endif
+
+  RunImpl();
+
+#ifdef PADDLE_WITH_CUDA
+  if (use_event) {
+    for (auto &p : dev_ctxes_) {
+      int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
+      auto stream =
+          static_cast<platform::CUDADeviceContext *>(p.second)->stream();
+      PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream));
+    }
+  }
+#endif
+}
+
+// Makes `waited_dev` wait for this op.
+//
+// On CUDA builds, if `waited_dev` is on a CPU place or this op has no events,
+// Wait() is called on every device context of this op; otherwise the stream of
+// `waited_dev` is made to wait on each of this op's events. Builds without
+// CUDA always call Wait() on every device context.
+void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) {
+    for (auto &dev_ctx : dev_ctxes_) {
+      dev_ctx.second->Wait();
+    }
+  } else {
+    auto stream =
+        static_cast<platform::CUDADeviceContext *>(waited_dev)->stream();
+    for (auto &ev : events_) {
+      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
+    }
+  }
+#else
+  for (auto &dev_ctx : dev_ctxes_) {
+    dev_ctx.second->Wait();
+  }
+#endif
+}
+
+// Appends `in` to this op's inputs and registers this op in the set of ops
+// pending on `in`.
+void OpHandleBase::AddInput(VarHandleBase *in) {
+  inputs_.push_back(in);
+  in->pending_ops_.insert(this);
+}
+
+// Appends `out` to this op's outputs and marks this op as the one that
+// generates `out`.
+void OpHandleBase::AddOutput(VarHandleBase *out) {
+  this->outputs_.push_back(out);
+  out->generated_op_ = this;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7a541ac4bb83625060db337446d03a1afda3ed0
--- /dev/null
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -0,0 +1,68 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include 
+#include 
+
+#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class OpHandleBase {
+ private:
+  DISABLE_COPY_AND_ASSIGN(OpHandleBase);
+
+ public:
+  std::vector inputs_;
+  std::vector outputs_;
+  std::unordered_map