Merge branch 'develop' of upstream into argsort_dev

28a0ac53 · Yibing Liu · 98460c00 · 88fa9c2e · 28a0ac53 · 28a0ac53
316 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,12 +55,13 @@ option(WITH_FLUID_ONLY  "Compile PaddlePaddle fluid only"               OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
-option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
+option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
+option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -147,7 +148,16 @@ include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
-include(external/grpc)
+if(WITH_DISTRIBUTE)
+    if(WITH_GRPC)
+        include(external/grpc)
+    else()
+        include(external/leveldb)
+        include(external/brpc)
+    endif()
+endif()
 include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)

--- a/Dockerfile
+++ b/Dockerfile
@@ -24,7 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
    apt-get install -y --allow-downgrades \
-    git python-pip python-dev openssh-server bison \
+    git python-pip python-dev python-opencv openssh-server bison \
    libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
@@ -76,8 +76,7 @@ RUN easy_install -U pip && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark
 RUN pip install pre-commit 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
-    pip install opencv-python
 #For docstring checker
 RUN pip install pylint pytest astroid isort

--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl
 caffe/image/logs
 tensorflow/image/logs
 tensorflow/rnn/logs
+fluid/models/*.pyc
+fluid/logs
+fluid/nohup.out
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
 FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
-RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop
+RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
 RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
 RUN pip install -U pip
-RUN pip install -U kubernetes opencv-python paddlepaddle
+RUN pip install -U kubernetes paddlepaddle
 # IMPORTANT:
 # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
@@ -19,4 +19,4 @@ ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py dataset.py models/ /workspace/
+ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@@ -24,14 +24,18 @@ Currently supported `--model` argument include:
 * Run the following command to start a benchmark job locally:
    ```bash
-      python fluid_benchmark.py --model mnist  --device GPU
+      python fluid_benchmark.py --model mnist --device GPU
    ```
    You can choose to use GPU/CPU training. With GPU training, you can specify
    `--gpus <gpu_num>` to run multi GPU training.
+    You can set async mode parameter server. With async mode, you can specify
+    `--async_mode` to train model asynchronous.
 * Run distributed training with parameter servers:
+    * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
    * start parameter servers:
        ```bash
        PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
+        sleep 15
        ```
    * start trainers:
        ```bash
@@ -42,6 +46,16 @@ Currently supported `--model` argument include:
    PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3  PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
    ```
+## Prepare the RecordIO file to Achieve Better Performance
+Run the following command will generate RecordIO files like "mnist.recordio" under the path
+and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size
+at any time using `fluid.batch`.
+```bash
+python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
+```
 ## Run Distributed Benchmark on Kubernetes Cluster
 You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will

--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+__all__ = ['parse_args', ]
+BENCHMARK_MODELS = [
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
+def parse_args():
+    parser = argparse.ArgumentParser('Fluid model benchmarks.')
+    parser.add_argument(
+        '--model',
+        type=str,
+        choices=BENCHMARK_MODELS,
+        default='resnet',
+        help='The model to run benchmark with.')
+    parser.add_argument(
+        '--batch_size', type=int, default=32, help='The minibatch size.')
+    #  args related to learning rate
+    parser.add_argument(
+        '--learning_rate', type=float, default=0.001, help='The learning rate.')
+    # TODO(wuyi): add "--use_fake_data" option back.
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=100, help='The number of passes.')
+    parser.add_argument(
+        '--data_format',
+        type=str,
+        default='NCHW',
+        choices=['NCHW', 'NHWC'],
+        help='The data data_format, now only support NCHW.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--gpus',
+        type=int,
+        default=1,
+        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    # this option is available only for vgg and resnet.
+    parser.add_argument(
+        '--cpus',
+        type=int,
+        default=1,
+        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+    parser.add_argument(
+        '--data_set',
+        type=str,
+        default='flowers',
+        choices=['cifar10', 'flowers'],
+        help='Optional dataset for benchmark.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--no_test',
+        action='store_true',
+        help='If set, do not test the testset during training.')
+    parser.add_argument(
+        '--memory_optimize',
+        action='store_true',
+        help='If set, optimize runtime memory before start.')
+    parser.add_argument(
+        '--use_fake_data',
+        action='store_true',
+        help='If set ommit the actual read data operators.')
+    parser.add_argument(
+        '--profile', action='store_true', help='If set, profile a few steps.')
+    parser.add_argument(
+        '--update_method',
+        type=str,
+        default='local',
+        choices=['local', 'pserver', 'nccl2'],
+        help='Choose parameter update method, can be local, pserver, nccl2.')
+    parser.add_argument(
+        '--no_split_var',
+        action='store_true',
+        default=False,
+        help='Whether split variables into blocks when update_method is pserver')
+    parser.add_argument(
+        '--async_mode',
+        action='store_true',
+        default=False,
+        help='Whether start pserver in async mode to support ASGD')
+    parser.add_argument(
+        '--use_reader_op',
+        action='store_true',
+        help='Whether to use reader op, and must specify the data path if set this to true.'
+    )
+    parser.add_argument(
+        '--data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the training recordio files.')
+    args = parser.parse_args()
+    return args
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -24,90 +24,7 @@ import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
 import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
-BENCHMARK_MODELS = [
+from args import *
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
-]
-def parse_args():
-    parser = argparse.ArgumentParser('Fluid model benchmarks.')
-    parser.add_argument(
-        '--model',
-        type=str,
-        choices=BENCHMARK_MODELS,
-        default='resnet',
-        help='The model to run benchmark with.')
-    parser.add_argument(
-        '--batch_size', type=int, default=32, help='The minibatch size.')
-    parser.add_argument(
-        '--learning_rate',
-        type=float,
-        default=0.001,
-        help='The minibatch size.')
-    # TODO(wuyi): add "--use_fake_data" option back.
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=5,
-        help='The first num of minibatch num to skip, for better performance test'
-    )
-    parser.add_argument(
-        '--iterations', type=int, default=80, help='The number of minibatches.')
-    parser.add_argument(
-        '--pass_num', type=int, default=100, help='The number of passes.')
-    parser.add_argument(
-        '--data_format',
-        type=str,
-        default='NCHW',
-        choices=['NCHW', 'NHWC'],
-        help='The data data_format, now only support NCHW.')
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='GPU',
-        choices=['CPU', 'GPU'],
-        help='The device type.')
-    parser.add_argument(
-        '--gpus',
-        type=int,
-        default=1,
-        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
-    parser.add_argument(
-        '--data_set',
-        type=str,
-        default='flowers',
-        choices=['cifar10', 'flowers'],
-        help='Optional dataset for benchmark.')
-    parser.add_argument(
-        '--infer_only', action='store_true', help='If set, run forward only.')
-    parser.add_argument(
-        '--use_cprof', action='store_true', help='If set, use cProfile.')
-    parser.add_argument(
-        '--use_nvprof',
-        action='store_true',
-        help='If set, use nvprof for CUDA.')
-    parser.add_argument(
-        '--no_test',
-        action='store_false',
-        help='If set, test the testset during training.')
-    parser.add_argument(
-        '--memory_optimize',
-        action='store_true',
-        help='If set, optimize runtime memory before start.')
-    parser.add_argument(
-        '--use_fake_data',
-        action='store_true',
-        help='If set ommit the actual read data operators.')
-    parser.add_argument(
-        '--profile', action='store_true', help='If set, profile a few steps.')
-    parser.add_argument(
-        '--update_method',
-        type=str,
-        default='local',
-        choices=['local', 'pserver', 'nccl2'],
-        help='Choose parameter update method, can be local, pserver, nccl2.')
-    args = parser.parse_args()
-    return args
 def append_nccl2_prepare(trainer_id):
@@ -142,7 +59,7 @@ def append_nccl2_prepare(trainer_id):
                        "nccl-based dist train.")
-def dist_transpile(trainer_id):
+def dist_transpile(trainer_id, args):
    if trainer_id < 0:
        return None, None
@@ -164,7 +81,12 @@ def dist_transpile(trainer_id):
    training_role = os.getenv("PADDLE_TRAINING_ROLE")
    t = distribute_transpiler.DistributeTranspiler()
-    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
+    t.transpile(
+        trainer_id,
+        pservers=pserver_endpoints,
+        trainers=trainers,
+        sync_mode=not args.async_mode,
+        slice_var_up=not args.no_split_var)
    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(current_endpoint)
        pserver_startup_program = t.get_startup_program(current_endpoint,
@@ -208,36 +130,57 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(startup_prog)
-    feed_var_list = [
-        var for var in train_prog.global_block().vars.itervalues()
+    if not args.use_reader_op:
-        if var.is_data
+        feed_var_list = [
-    ]
+            var for var in train_prog.global_block().vars.itervalues()
-    feeder = fluid.DataFeeder(feed_var_list, place)
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)
    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
+        if not args.use_reader_op:
+            reader_generator = train_reader()
+        batch_id = 0
+        data = None
+        while True:
+            if not args.use_reader_op:
+                data = next(reader_generator, None)
+                if data == None:
+                    break
+            if iters == args.iterations:
+                break
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
-            if iters == args.iterations:
-                break
+            if args.use_reader_op:
-            loss = exe.run(train_prog,
+                try:
-                           feed=feeder.feed(data),
+                    loss = exe.run(train_prog, fetch_list=[avg_loss])
-                           fetch_list=[avg_loss])
+                except fluid.core.EnforceNotMet as ex:
+                    break
+            else:
+                loss = exe.run(train_prog,
+                               feed=feeder.feed(data),
+                               fetch_list=[avg_loss])
            iters += 1
-            num_samples += len(data)
+            batch_id += 1
+            # FIXME(wuyi): For use_reader_op, if the current
+            # pass is not the last, the last batch of this pass
+            # is also equal to args.batch_size.
+            if args.use_reader_op:
+                num_samples += args.batch_size * args.gpus
+            else:
+                num_samples += len(data)
            train_losses.append(loss)
            print("Pass: %d, Iter: %d, Loss: %f\n" %
                  (pass_id, iters, np.mean(train_losses)))
-        train_elapsed = time.time() - start_time
+        print_train_time(start_time, time.time(), num_samples)
-        examples_per_sec = num_samples / train_elapsed
+        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses)))
        # evaluation
-        if not args.no_test and batch_acc != None:
+        if not args.no_test and batch_acc and not args.use_reader_op:
            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
                                 batch_acc)
            print(", Test Accuracy: %f" % pass_test_acc)
@@ -251,10 +194,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
 def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
                   num_trainers, trainer_id):
-    feed_var_list = [
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-        var for var in train_prog.global_block().vars.itervalues()
+    if not args.use_reader_op:
-        if var.is_data
+        feed_var_list = [
-    ]
+            var for var in train_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)
    # generate fake:
    if args.use_fake_data:
        for var in feed_var_list:
@@ -271,7 +218,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                       "value": 1.0,
                       "dtype": var.dtype})
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    if nccl_id_var and trainer_id == 0:
        #FIXME(wuyi): wait other trainer to start listening
        time.sleep(30)
@@ -288,12 +234,21 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
        num_trainers=num_trainers,
        trainer_id=trainer_id)
-    feeder = fluid.DataFeeder(feed_var_list, place)
    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        start_time = time.time()
-        for batch_id, data in enumerate(train_reader()):
+        if not args.use_reader_op:
+            reader_generator = train_reader()
+        batch_id = 0
+        data = None
+        while True:
+            if not args.use_reader_op:
+                data = next(reader_generator, None)
+                if data == None:
+                    break
+            if iters == args.iterations:
+                break
            if args.profile and pass_id == 0 and batch_id == 5:
                profiler.start_profiler("All")
            elif args.profile and pass_id == 0 and batch_id == 10:
@@ -302,39 +257,50 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
-            if iters == args.iterations:
+            if args.use_fake_data or args.use_reader_op:
-                break
+                try:
-            if args.use_fake_data:
+                    loss, = exe.run([avg_loss.name])
-                loss, = exe.run([avg_loss.name])
+                except fluid.core.EnforceNotMet as ex:
+                    break
            else:
                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
            if args.update_method == "pserver":
                exe.bcast_params()
-            num_samples += len(data)
+            if args.use_reader_op:
+                num_samples += args.batch_size * args.gpus
+            else:
+                num_samples += len(data)
            iters += 1
            if batch_id % 1 == 0:
                print("Pass %d, batch %d, loss %s" %
                      (pass_id, batch_id, np.array(loss)))
-        train_elapsed = time.time() - start_time
+            batch_id += 1
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+        print_train_time(start_time, time.time(), num_samples)
-              (num_samples, train_elapsed, examples_per_sec))
+        if not args.no_test and batch_acc and not args.use_reader_op:
-        if not args.no_test and batch_acc != None:
+            # we have not implement record io for test
+            # skip test when use args.use_reader_op
            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
                            batch_acc)
            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
-        exit(0)
 def print_arguments(args):
    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
                                vars(args)['device'] == 'GPU')
-    print('----------- resnet Configuration Arguments -----------')
+    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
+def print_train_time(start_time, end_time, num_samples):
+    train_elapsed = end_time - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+          (num_samples, train_elapsed, examples_per_sec))
 def main():
    args = parse_args()
    print_arguments(args)
@@ -342,7 +308,7 @@ def main():
    # the unique trainer id, starting from 0, needed by trainer
    # only
    nccl_id_var, num_trainers, trainer_id = (
-        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
    if args.use_cprof:
        pr = cProfile.Profile()
@@ -356,7 +322,7 @@ def main():
        fluid.memory_optimize(fluid.default_main_program())
    if args.update_method == "pserver":
-        train_prog, startup_prog = dist_transpile(trainer_id)
+        train_prog, startup_prog = dist_transpile(trainer_id, args)
        if not train_prog:
            raise Exception(
                "Must configure correct environments to run dist train.")

--- a/benchmark/fluid/models/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@@ -197,6 +197,8 @@ def lodtensor_to_ndarray(lod_tensor):
 def get_model(args):
+    if args.use_reader_op:
+        raise Exception("machine_translation do not support reader op for now.")
    embedding_dim = 512
    encoder_size = 512
    decoder_size = 512
@@ -221,7 +223,7 @@ def get_model(args):
    train_batch_generator = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
    test_batch_generator = paddle.batch(
        paddle.reader.shuffle(

--- a/benchmark/fluid/models/mnist.py
+++ b/benchmark/fluid/models/mnist.py
@@ -20,6 +20,7 @@ import numpy as np
 import argparse
 import time
 import cProfile
+import os
 import paddle
 import paddle.fluid as fluid
@@ -65,19 +66,49 @@ def cnn_model(data):
 def get_model(args):
-    # Input data
+    if args.use_reader_op:
-    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        filelist = [
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
-    # Train program
+        data_file = fluid.layers.open_files(
-    predict = cnn_model(images)
+            filenames=filelist,
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
+            shapes=[[-1, 1, 28, 28], (-1, 1)],
-    avg_cost = fluid.layers.mean(x=cost)
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
-    # Evaluator
+            thread_num=args.gpus,
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            pass_num=args.pass_num)
-    batch_acc = fluid.layers.accuracy(
+        data_file = fluid.layers.double_buffer(
-        input=predict, label=label, total=batch_size_tensor)
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        images, label = fluid.layers.read_file(data_file)
+    else:
+        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    if args.device == 'CPU' and args.cpus > 1:
+        places = fluid.layers.get_places(args.cpus)
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            predict = cnn_model(pd.read_input(images))
+            label = pd.read_input(label)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+            pd.write_output(avg_cost)
+            pd.write_output(batch_acc)
+        avg_cost, batch_acc = pd()
+        avg_cost = fluid.layers.mean(avg_cost)
+        batch_acc = fluid.layers.mean(batch_acc)
+    else:
+        # Train program
+        predict = cnn_model(images)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        # Evaluator
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)
    # inference program
    inference_program = fluid.default_main_program().clone()
@@ -88,7 +119,7 @@ def get_model(args):
    # Reader
    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 import functools
 import numpy as np
 import time
+import os
 import cProfile, pstats, StringIO
@@ -26,6 +27,7 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
+from recordio_converter import imagenet_train, imagenet_test
 def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
@@ -122,40 +124,85 @@ def get_model(args):
        else:
            dshape = [32, 32, 3]
        model = resnet_cifar10
-    else:
+        train_reader = paddle.dataset.cifar.train10()
+        test_reader = paddle.dataset.cifar.test10()
+    elif args.data_set == "flowers":
        class_dim = 102
        if args.data_format == 'NCHW':
            dshape = [3, 224, 224]
        else:
            dshape = [224, 224, 3]
        model = resnet_imagenet
+        train_reader = paddle.dataset.flowers.train()
-    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+        test_reader = paddle.dataset.flowers.test()
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    elif args.data_set == "imagenet":
-    predict = model(input, class_dim)
+        class_dim = 1000
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
+        if args.data_format == 'NCHW':
-    avg_cost = fluid.layers.mean(x=cost)
+            dshape = [3, 224, 224]
+        else:
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            dshape = [224, 224, 3]
-    batch_acc = fluid.layers.accuracy(
+        model = resnet_imagenet
-        input=predict, label=label, total=batch_size_tensor)
+        if not args.data_path:
+            raise Exception(
+                "Must specify --data_path when training with imagenet")
+        train_reader = imagenet_train(args.data_path)
+        test_reader = imagenet_test(args.data_path)
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1] + dshape, (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        input, label = fluid.layers.read_file(data_file)
+    else:
+        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    if args.device == 'CPU' and args.cpus > 1:
+        places = fluid.layers.get_places(args.cpus)
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            predict = model(pd.read_input(input), class_dim)
+            label = pd.read_input(label)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+            pd.write_output(avg_cost)
+            pd.write_output(batch_acc)
+        avg_cost, batch_acc = pd()
+        avg_cost = fluid.layers.mean(avg_cost)
+        batch_acc = fluid.layers.mean(batch_acc)
+    else:
+        predict = model(input, class_dim)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
+            target_vars=[batch_acc])
    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-    train_reader = paddle.batch(
+    batched_train_reader = paddle.batch(
        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
+            train_reader, buf_size=5120),
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+        batch_size=args.batch_size * args.gpus,
-            buf_size=5120),
+        drop_last=True)
-        batch_size=args.batch_size)
+    batched_test_reader = paddle.batch(
-    test_reader = paddle.batch(
+        train_reader, batch_size=args.batch_size, drop_last=True)
-        paddle.dataset.cifar.test10()
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+    return avg_cost, inference_program, optimizer, batched_train_reader,\
-        batch_size=args.batch_size)
+                   batched_test_reader, batch_acc
-    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size):
 def get_model(args):
+    if args.use_reader_op:
+        raise Exception(
+            "stacked_dynamic_lstm do not support reader op for now.")
    lstm_size = 512
    emb_dim = 512
    crop_size = 1500
@@ -115,7 +118,7 @@ def get_model(args):
    train_reader = batch(
        paddle.reader.shuffle(
            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
    test_reader = batch(
        paddle.reader.shuffle(
            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),

--- a/benchmark/fluid/models/vgg.py
+++ b/benchmark/fluid/models/vgg.py
@@ -22,6 +22,7 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 import argparse
 import functools
+import os
 def vgg16_bn_drop(input):
@@ -65,9 +66,25 @@ def get_model(args):
        else:
            data_shape = [224, 224, 3]
-    # Input data
+    if args.use_reader_op:
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+        filelist = [
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1] + data_shape, (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        images, label = fluid.layers.read_file(data_file)
+    else:
+        images = fluid.layers.data(
+            name='data', shape=data_shape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # Train program
    net = vgg16_bn_drop(images)
@@ -95,7 +112,7 @@ def get_model(args):
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),

--- a/benchmark/fluid/recordio_converter.py
+++ b/benchmark/fluid/recordio_converter.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import random
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.dataset import mnist, cifar, flowers, image
+def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
+                       shape_label):
+    num_batches = 0
+    with fluid.program_guard(fluid.Program(), fluid.Program()):
+        reader = paddle.batch(py_reader(), batch_size=batch_size)
+        feeder = fluid.DataFeeder(
+            feed_list=[  # order is image and label
+                fluid.layers.data(
+                    name='image', shape=shape_data),
+                fluid.layers.data(
+                    name='label', shape=shape_label, dtype='int64'),
+            ],
+            place=fluid.CPUPlace())
+        num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
+            outfilepath, reader, feeder)
+    return num_batches
+def prepare_mnist(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "mnist.recordio")
+    convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])
+def prepare_cifar10(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "cifar.recordio")
+    convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])
+def prepare_flowers(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "flowers.recordio")
+    convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
+                       [1])
+def default_mapper(sample):
+    img, label = sample
+    img = image.simple_transform(
+        img, 256, 224, True, mean=[103.94, 116.78, 123.68])
+    return img.flatten().astype('float32'), label
+def imagenet_train(data_dir):
+    contents = os.listdir(data_dir)
+    if set(contents) != set(
+        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
+        raise Exception("Imagenet data contents error!")
+    img2label = dict()
+    imgfilelist = []
+    with open(os.path.join(data_dir, "train.txt")) as fn:
+        while 1:
+            l = fn.readline()
+            if not l:
+                break
+            img, lbl = l[:-1].split(" ")
+            img2label[img] = int(lbl)
+            imgfilelist.append(img)
+    # shuffle all, this is slow
+    random.shuffle(imgfilelist)
+    def train_reader():
+        for idx, imgfile in enumerate(imgfilelist):
+            data = image.load_image(
+                os.path.join(data_dir, "train", imgfile.lower()))
+            label = [img2label[imgfile], ]
+            yield [data, label]
+    return paddle.reader.map_readers(default_mapper, train_reader)
+def imagenet_test(data_dir):
+    contents = os.listdir(data_dir)
+    if set(contents) != set(
+        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
+        raise Exception("Imagenet data contents error!")
+    img2label = dict()
+    imgfilelist = []
+    with open(os.path.join(data_dir, "val.txt")) as fn:
+        while 1:
+            l = fn.readline()
+            if not l:
+                break
+            img, lbl = l[:-1].split(" ")
+            img2label[img] = int(lbl)
+            imgfilelist.append(img)
+    def test_reader():
+        for idx, imgfile in enumerate(imgfilelist):
+            base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
+            image_path = ".".join([base_path, "jpeg"])
+            data = image.load_image(image_path)
+            label = [img2label[imgfile], ]
+            yield [data, label]
+    return paddle.reader.map_readers(default_mapper, test_reader)
+# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
+def convert_reader_to_recordio_files(
+        filename,
+        batch_per_file,
+        reader_creator,
+        feeder,
+        compressor=core.RecordIOWriter.Compressor.Snappy,
+        max_num_records=1000,
+        feed_order=None):
+    if feed_order is None:
+        feed_order = feeder.feed_names
+    f_name, f_ext = os.path.splitext(filename)
+    assert (f_ext == ".recordio")
+    lines = []
+    f_idx = 0
+    counter = 0
+    for idx, batch in enumerate(reader_creator()):
+        lines.append(batch)
+        if idx >= batch_per_file and idx % batch_per_file == 0:
+            filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
+            with fluid.recordio_writer.create_recordio_writer(
+                    filename, compressor, max_num_records) as writer:
+                for l in lines:
+                    res = feeder.feed(l)
+                    for each in feed_order:
+                        writer.append_tensor(res[each])
+                    writer.complete_append_tensor()
+                    counter += 1
+                lines = []
+                f_idx += 1
+            print("written file: ", filename)
+    return counter
+def prepare_imagenet(inpath, outpath, batch_size):
+    r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
+    feeder = fluid.DataFeeder(
+        feed_list=[
+            fluid.layers.data(
+                name="image", shape=[3, 224, 224]), fluid.layers.data(
+                    name="label", shape=[1], dtype='int64')
+        ],
+        place=fluid.CPUPlace())
+    outpath = os.path.join(outpath, "imagenet.recordio")
+    convert_reader_to_recordio_files(outpath, 10000, r, feeder)
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@@ -2,6 +2,7 @@
 # This script benchmarking the PaddlePaddle Fluid on
 # single thread single GPU.
+mkdir -p logs
 #export FLAGS_fraction_of_gpu_memory_to_use=0.0
 export CUDNN_PATH=/paddle/cudnn_v5
@@ -35,6 +36,7 @@ nohup stdbuf -oL nvidia-smi \
      --format=csv \
      --filename=mem.log  \
      -l 1 &
 # mnist
 # mnist gpu mnist 128
 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
@@ -43,7 +45,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=500 \
-               2>&1 | tee -a mnist_gpu_128.log
+               2>&1 | tee -a logs/mnist_gpu_128.log
 # vgg16
 # gpu cifar10 128
@@ -53,7 +55,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a vgg16_gpu_128.log
+               2>&1 | tee -a logs/vgg16_gpu_128.log
 # flowers gpu  128
 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
@@ -63,28 +65,28 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --data_set=flowers \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a vgg16_gpu_flowers_32.log
+               2>&1 | tee -a logs/vgg16_gpu_flowers_32.log
 # resnet50
 # resnet50 gpu cifar10 128
 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
-               --model=resnet50 \
+               --model=resnet \
               --device=GPU \
               --batch_size=128 \
               --data_set=cifar10 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a resnet50_gpu_128.log
+               2>&1 | tee -a logs/resnet50_gpu_128.log
 # resnet50 gpu flowers 64
 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
-               --model=resnet50 \
+               --model=resnet \
               --device=GPU \
               --batch_size=64 \
               --data_set=flowers \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a resnet50_gpu_flowers_64.log
+               2>&1 | tee -a logs/resnet50_gpu_flowers_64.log
 # lstm
 # lstm gpu imdb 32 # tensorflow only support batch=32
@@ -94,7 +96,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --batch_size=32 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a lstm_gpu_32.log
+               2>&1 | tee -a logs/lstm_gpu_32.log
 # seq2seq
 # seq2seq gpu wmb 128
@@ -104,4 +106,4 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a lstm_gpu_128.log
+               2>&1 | tee -a logs/lstm_gpu_128.log
--- a/benchmark/fluid/run_fluid_benchmark.sh
+++ b/benchmark/fluid/run_fluid_benchmark.sh
+#!/bin/bash
+PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
+sleep 15
+CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
+CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -92,6 +92,9 @@ if(WITH_GPU)
        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
            message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
        endif()
+        if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+            message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+        endif()
        include_directories(${TENSORRT_INCLUDE_DIR})
    endif()
 elseif(WITH_AMD_GPU)
@@ -115,6 +118,10 @@ endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
+if(WITH_DISTRIBUTE)
+  add_definitions(-DPADDLE_WITH_DISTRIBUTE)
+endif()
 if(WITH_GOLANG)
  # we need to symlink Paddle directory into GOPATH. If we
  # don't do it and we have code that depends on Paddle, go
@@ -163,3 +170,7 @@ if(WITH_GOLANG)
  endif()
 endif(WITH_GOLANG)
+if(WITH_GRPC)
+    add_definitions(-DPADDLE_WITH_GRPC)
+endif(WITH_GRPC)
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+INCLUDE(ExternalProject)
+SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
+SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
+SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE)
+SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE)
+INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
+# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf")
+# If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
+ExternalProject_Add(
+    extern_brpc
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/brpc/brpc"
+    GIT_TAG         "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2"
+    PREFIX          ${BRPC_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR}
+                    -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    -DCMAKE_PREFIX_PATH=${prefix_path}
+                    -DBRPC_WITH_GLOG=ON
+                    ${EXTERNAL_OPTIONAL_ARGS}
+    LIST_SEPARATOR |
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR}
+                     -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+)
+ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy)
+ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
+ADD_DEPENDENCIES(brpc extern_brpc)
+LIST(APPEND external_project_dependencies brpc)
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -33,10 +33,19 @@ ELSE()
  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
 ENDIF()
+# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them
 ExternalProject_Add(
    extern_grpc
    DEPENDS protobuf zlib
-    URL "http://paddlepaddledeps.bj.bcebos.com/grpc.tar.xz"
+    # NOTE(wuyi):
+    # this package is generated by following steps:
+    # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
+    # 2. submodule update --init
+    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
+    #    checkout and clean other dirs under third_party
+    # 4. remove .git, and package the directory.
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
+    URL_MD5  "c9c58ee7d0e8929a63155af6a2ecdbd0"
    PREFIX          ${GRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""
@@ -49,7 +58,6 @@ ExternalProject_Add(
    INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
 )
-# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
 ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
             "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")

--- a/cmake/external/leveldb.cmake
+++ b/cmake/external/leveldb.cmake
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+INCLUDE(ExternalProject)
+SET(LEVELDB_SOURCES_DIR ${THIRD_PARTY_PATH}/leveldb)
+SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb)
+SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE)
+SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE)
+INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR})
+ExternalProject_Add(
+    extern_leveldb
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX ${LEVELDB_SOURCES_DIR}
+    URL "https://github.com/google/leveldb/archive/v1.18.tar.gz"
+    URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
+    INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ 
+        && cp ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES}
+        && cp -r ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/
+    BUILD_IN_SOURCE 1
+)
+ADD_DEPENDENCIES(extern_leveldb snappy)
+ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
+ADD_DEPENDENCIES(leveldb extern_leveldb)
+LIST(APPEND external_project_dependencies leveldb)
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -610,3 +610,21 @@ function(grpc_library TARGET_NAME)
    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
 endfunction()
+function(brpc_library TARGET_NAME)
+  set(oneValueArgs PROTO)
+  set(multiValueArgs SRCS DEPS)
+  set(options "")
+  cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  message(STATUS "generating brpc ${brpc_library_PROTO}")
+  get_filename_component(ABS_PROTO ${brpc_library_PROTO} ABSOLUTE)
+  get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE)
+  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+  protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
+  cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}")
+  cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}")
+endfunction()
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -59,3 +59,21 @@ get_inference_program
 ..  autofunction:: paddle.fluid.io.get_inference_program
    :noindex:
+save_checkpoint
+---------------
+..  autofunction:: paddle.fluid.io.save_checkpoint
+    :noindex:
+load_checkpoint
+---------------
+..  autofunction:: paddle.fluid.io.load_checkpoint
+    :noindex:
+clean_checkpoint
+----------------
+..  autofunction:: paddle.fluid.io.clean_checkpoint
+    :noindex:
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -181,6 +181,12 @@ Print
 ..  autofunction:: paddle.fluid.layers.Print
    :noindex:
+is_empty
+--------
+..  autofunction:: paddle.fluid.layers.is_empty
+    :noindex:
 device
 ======
@@ -255,6 +261,19 @@ double_buffer
 ..  autofunction:: paddle.fluid.layers.double_buffer
    :noindex:
+random_data_generator
+---------------------
+..  autofunction:: paddle.fluid.layers.random_data_generator
+    :noindex:
+Preprocessor
+------------
+..  autoclass:: paddle.fluid.layers.Preprocessor
+    :members:
+    :noindex:
 nn
 ==
@@ -594,6 +613,29 @@ roi_pool
 ..  autofunction:: paddle.fluid.layers.roi_pool
    :noindex:
+dice_loss
+---------
+..  autofunction:: paddle.fluid.layers.dice_loss
+    :noindex:
+resize_bilinear
+---------------
+..  autofunction:: paddle.fluid.layers.resize_bilinear
+    :noindex:
+gather
+------
+..  autofunction:: paddle.fluid.layers.gather
+    :noindex:
+random_crop
+-----------
+..  autofunction:: paddle.fluid.layers.random_crop
+    :noindex:
 ops
 ===
@@ -742,6 +784,12 @@ sum
 ..  autofunction:: paddle.fluid.layers.sum
    :noindex:
+shape
+-----
+..  autofunction:: paddle.fluid.layers.shape
+    :noindex:
 sigmoid
 -------
@@ -991,27 +1039,3 @@ zeros
 ..  autofunction:: paddle.fluid.layers.zeros
    :noindex:
-topk
----
-..  autofunction:: paddle.fluid.layers.topk
-    :noindex:
-dice_loss
----
-..  autofunction:: paddle.fluid.layers.dice_loss
-    :noindex:
-resize_bilinear
-____
-..  autofunction:: paddle.fluid.layers.resize_bilinear
-    :noindex:
-gather
-____
-..  autofunction:: paddle.fluid.layers.gather
-    :noindex:
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -47,28 +47,6 @@ DecayedAdagrad
    :members:
    :noindex:
-Adadelta
-----------------
-..  autoclass:: paddle.fluid.optimizer.Adadelta
-    :members:
-    :noindex:
-RMSProp
-----------------
-..  autoclass:: paddle.fluid.optimizer.RMSProp
-    :members:
-    :noindex:
-ModelAverage
-----------------
-..  autoclass:: paddle.fluid.optimizer.ModelAverage
-    :members:
-    :noindex:
 SGDOptimizer
 ------------
@@ -111,25 +89,31 @@ DecayedAdagradOptimizer
    :members:
    :noindex:
+RMSPropOptimizer
+----------------
-AdadeltaOptimizer
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
-----------------
-..  autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
    :members:
    :noindex:
+Adadelta
+--------
-RMSPropOptimizer
+..  autoclass:: paddle.fluid.optimizer.Adadelta
-----------------
+    :members:
+    :noindex:
-..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+ModelAverage
+------------
+..  autoclass:: paddle.fluid.optimizer.ModelAverage
    :members:
    :noindex:
 Optimizer
 ---------
 ..  autoclass:: paddle.fluid.optimizer.Optimizer
    :members:
    :noindex:
--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -23,3 +23,15 @@ profiler
 ..  autofunction:: paddle.fluid.profiler.profiler
    :noindex:
+start_profiler
+--------------
+..  autofunction:: paddle.fluid.profiler.start_profiler
+    :noindex:
+stop_profiler
+-------------
+..  autofunction:: paddle.fluid.profiler.stop_profiler
+    :noindex:
--- a/doc/fluid/dev/api_doc_std_cn.md
+++ b/doc/fluid/dev/api_doc_std_cn.md
 # API注释撰写标准
- [API注释模块](#API注释模块)
+- [API注释撰写标准](#api)
- [格式及示例](#格式及示例)
+    - [API注释模块](#api)
- [完整示例](#完整示例)
+    - [格式及示例](#)
+    - [完整示例](#)
 ## API注释模块
@@ -217,4 +218,4 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
 ## 完整示例
-fc 的完整注释见[示例](src/fc.py)。
+fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
--- a/doc/fluid/dev/api_doc_std_en.md
+++ b/doc/fluid/dev/api_doc_std_en.md
 # API Doc Standard
- [API Doc Structure](#API Doc Structure)
+- [API Doc Standard](#api-doc-standard)
- [Format and Examples](#Format and Examples)
+    - [API Doc Structure](#api-doc-structure)
- [Complete Example](#Complete Example)
+    - [Format and Examples](#format-and-examples)
+    - [Complete Example](#complete-example)
 ## API Doc Structure
@@ -223,4 +224,4 @@ Format and examples of each part of API documantation are as follows: (take fc f
 ## Complete Example
-Complete Example of fc please see [here](src/fc.py)。
+Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
--- a/doc/fluid/howto/cluster/fluid_recordio.md
+++ b/doc/fluid/howto/cluster/fluid_recordio.md
+# How to use RecordIO in Fluid
+If you want to use RecordIO as your training data format, you need to convert to your training data
+to RecordIO files and reading them in the process of training, PaddlePaddle Fluid provides some
+interface to deal with the RecordIO files.
+## Generate RecordIO File
+Before start training with RecordIO files, you need to convert your training data
+to RecordIO format by `fluid.recordio_writer.convert_reader_to_recordio_file`, the sample codes
+as follows:
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+            name='image', shape=[784]),
+            fluid.layers.data(
+            name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder)
+```
+The above code snippet would generate a RecordIO `./mnist.recordio` on your host.
+**NOTE**: we recommend users to set `batch_size=1` when generating the recordio files so that users can
+adjust it flexibly while reading it.
+## Use the RecordIO file in a Local Training Job
+PaddlePaddle Fluid provides an interface `fluid.layers.io.open_recordio_file` to load your RecordIO file
+and then you can use them as a Layer in your network configuration, the sample codes as follows:
+```python
+    data_file = fluid.layers.io.open_recordio_file(
+        filename="./mnist.recordio",
+        shapes=[(-1, 784),(-1, 1)],
+        lod_levels=[0, 0],
+        dtypes=["float32", "int32"])
+    data_file = fluid.layers.io.batch(data_file, batch_size=4)
+    img, label = fluid.layers.io.read_file(data_file)
+    hidden = fluid.layers.fc(input=img, size=100, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+    fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    avg_loss_np = []
+    # train a pass
+    batch_id = 0
+    while True:
+        tmp, = exe.run(fetch_list=[avg_loss])
+        avg_loss_np.append(tmp)
+        print(batch_id)
+        batch_id += 1
+```
+## Use the RecordIO files in Distributed Training
+1. generate multiple RecordIO files
+For a distributed training job, you may have multiple trainer nodes,
+and one or more RecordIO files for one trainer node, you can use the interface
+`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data
+into multiple RecordIO files, the sample codes as follows:
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+            name='image', shape=[784]),
+            fluid.layers.data(
+            name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_files(
+          filename_suffix='./mnist.recordio', batch_per_file=100, reader, feeder)
+```
+The above codes would generate multiple RecordIO files on your host like:
+```bash
+.
+ \_mnist-00000.recordio
+ |-mnist-00001.recordio
+ |-mnist-00002.recordio
+ |-mnist-00003.recordio
+ |-mnist-00004.recordio
+```
+2. open multiple RecordIO files by `fluid.layers.io.open_files`
+For a distributed training job, the distributed operator system will schedule trainer process on multiple nodes,
+each trainer process reads parts of the whole training data, we usually take the following approach to make the training
+data allocated by each trainer process as uniform as possiable:
+```python
+def gen_train_list(file_pattern, trainers, trainer_id):
+   file_list = glob.glob(file_pattern)
+   ret_list = []
+   for idx, f in enumerate(file_list):
+       if (idx + trainers) % trainers == trainer_id:
+           ret_list.append(f)
+   return ret_list
+trainers = int(os.getenv("TRAINERS"))
+trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
+data_file = fluid.layers.io.open_files(
+    filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
+    thread_num=1,
+    shapes=[(-1, 784),(-1, 1)],
+    lod_levels=[0, 0],
+    dtypes=["float32", "int32"])
+img, label = fluid.layers.io.read_file(data_files)
+...
+```
--- a/doc/fluid/howto/optimization/benchmark/README.md
+++ b/doc/fluid/howto/optimization/benchmark/README.md
-../../../../../benchmark/cluster/README.md
\ No newline at end of file
--- a/doc/fluid/howto/optimization/benchmark/vgg16/README.md
+++ b/doc/fluid/howto/optimization/benchmark/vgg16/README.md
-../../../../../../benchmark/cluster/vgg16/README.md
\ No newline at end of file
--- a/doc/fluid/howto/optimization/host_memory_profiling_cn.md
+++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
+## 堆内存分析和优化
+计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放，随着程序的运行占用的内存越来越大，一方面会影响程序的稳定性，可能让运行速度越来越慢，或者造成oom，甚至会影响运行程序的机器的稳定性，造成宕机。
+目前有很多内存泄漏分析工具，比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。
+因为Fluid是用Python驱动C++ core来运行，valgrind直接分析非常困难，需要自己编译debug版本的、带valgrind支持的专用Python版本，而且输出的信息中大部分是Python自己的符号和调用信息，分析起来很困难，另外使用valgrind会让程序运行速度变得非常慢，所以不建议使用。
+本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。
+gperftool主要支持以下四个功能：
+- thread-caching malloc
+- heap-checking using tcmalloc
+- heap-profiling using tcmalloc
+- CPU profiler
+Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。
+对于堆内存的分析，主要用到thread-caching malloc和heap-profiling using tcmalloc。
+## 使用流程
+#### 环境
+本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev，基于Ubuntu 16.04.4 LTS环境。
+#### 使用流程
+- 安装google-perftools
+```
+apt-get install libunwind-dev 
+apt-get install google-perftools
+```
+- 安装pprof
+```
+go get -u github.com/google/pprof
+```
+- 设置运行环境
+```
+export PPROF_PATH=/root/gopath/bin/pprof
+export PPROF_BINARY_PATH=/root/gopath/bin/pprof
+export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
+```
+- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。
+```
+# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀
+# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次dump，默认1GB
+env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py
+```
+随着程序的运行，会在perf_log这个文件夹下生成很多文件，如下：
+```
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0001.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0002.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0003.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0004.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0005.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0006.heap
+```
+- 使用pprof对heap文件进行分析。分析有两种模式：
+	- 完整模式。会对当前heap做一个分析，显示目前分配内存一些调用路径。
+	```
+	pprof --pdf python test.log.0012.heap
+	```
+	上述命令会生成一个profile00x.pdf的文件，可以直接打开，例如：[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出，在CPU版本fluid的运行过程中，分配存储最多的模块式CPUAllocator. 而别的模块相对而言分配内存较少，所以被忽略了，这对于分配内存泄漏是很不方便的，因为泄漏是一个缓慢的过程，在这种图中是无法看到的。
+	![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png)
+	- Diff模式。可以对两个时刻的heap做diff，把一些内存分配没有发生变化的模块去掉，而把增量部分显示出来。
+	```
+	pprof --pdf --base test.log.0010.heap python test.log.1045.heap
+	```
+	生成的结果为：[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf)
+	从图中可以看出：ProgramDesc这个结构，在两个版本之间增长了200MB+，所以这里有很大的内存泄漏的可能性，最终结果也确实证明是这里造成了泄漏。
+	![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png)
+	![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png)
--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -63,16 +63,16 @@ Android的Docker开发镜像向用户提供两个可配置的参数：
 - 编译`armeabi-v7a`，`Android API 21`的PaddlePaddle库
 ```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
 ```
 - 编译`arm64-v8a`，`Android API 21`的PaddlePaddle库
 ```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
 ```
-执行上述`docker run`命令时，容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置，并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`，`ANDROID_API<21`时，Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节，根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后，PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录，所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
+执行上述`docker run`命令时，容器执行[paddle/scripts/paddle_build.sh build_android](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置，并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`，`ANDROID_API<21`时，Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节，根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后，PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录，所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
 ## 基于Linux交叉编译环境的编译方式
 本文档将以Linux x86-64平台为例，介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。

--- a/doc/mobile/cross_compiling_for_android_en.md
+++ b/doc/mobile/cross_compiling_for_android_en.md
@@ -36,7 +36,7 @@ $ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
 We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
 ```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android ./paddle/scripts/paddle_build.sh build_android
 ```
 The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
@@ -70,7 +70,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
 The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
-The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`.  For information about other configuration arguments, please continue reading.
+The build command, [`paddle/scripts/paddle_build.sh build_android`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`.  For information about other configuration arguments, please continue reading.
 The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`.

--- a/doc/survey/dynamic_graph.md
+++ b/doc/survey/dynamic_graph.md
+# Automatic Differentiation with the Tape
+## Automatic Differentiation
+A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers.  Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf).
+## The Tape
+Given the forward pass program (usually in Python in practices), there are two strategies to derive the backward pass:
+1. from the forward pass program itself, or
+1. from the execution trace of the forward pass program, which is often known as the *tape*.
+This article surveys systems that follow the latter strategy.
+## Dynamic Network
+When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration.  This is known as *dynamic network*.
+Deep learning systems that utilize the idea of dynamic network gained their popularities in recent years.  This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/).
+## An Overview
+Both frameworks record a ‘tape’ of the computation and interpreting (or run-time compiling) a transformation of the tape played back in reverse. This tape is a different kind of entity than the original program.[[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf)
+Consider the following code feedforward model.
+```python
+x = Variable(randn(20, 1)))
+label = Variable(randint(1))
+W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20))
+h = matmul(W_1, x)
+pred = matmul(W_2, x)
+loss = softmax(pred, label)
+loss.backward()
+```
+### 1) Dynet uses List to encode the Tape
+During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, are recorded in the tape, along with the necessary information needed to do the backward such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.
+<details> 
+<summary></summary>
+digraph g {
+    graph [
+        rankdir = "LR"
+    ];
+    node [
+        fontsize = "16"
+        shape = "ellipse"
+    ];
+    edge [];
+    "node0" [
+        label = "<f0> type: matmul | <f1> input: W_1, x | <f2> output: h"
+        shape = "record"
+    ];
+    "node1" [
+        label = "<f0> type: matmul | <f1> input: W_2, h | <f2> output: pred"
+        shape = "record"
+    ];
+    "node2" [
+        label = "<f0> type: softmax | <f1> input: pred, label | <f2> output: loss"
+        shape = "record"
+    ];
+    "node0":f0 -> "node1":f0 [];
+    "node1":f0 -> "node2":f0 [];
+}
+</details>
+![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20})
+### 2) Pytorch uses Node Graph to encode the Tape
+The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order.
+<details> 
+<summary></summary>
+digraph g {
+    graph [
+        rankdir = "LR"
+    ];
+    subgraph function {
+        node [
+            fontsize = "16"
+            style = filled
+            shape = "record"
+        ];
+        "matmul0" [ label = "<f0> type: matmul | prev_func: None" ];
+        "matmul1" [ label = "<f0> type: matmul | prev_func: matmul" ];
+        "softmax" [ label = "<f0> type: softmax | prev_func: matmul" ];
+    }
+    subgraph variable {
+        node [
+            fontsize = "16"
+            shape = "Mrecord"
+            style = filled
+            fillcolor = white
+        ];
+        "x" [ label = "<f0> x | <f1> creator: None" ];
+        "label" [ label = "<f0> label | <f1> creator: None" ];
+        "W_1" [ label = "<f0> W_1 | <f1> creator: None" ];
+        "W_2" [ label = "<f0> W_2 | <f1> creator: None" ];
+        "h" [ label = "<f0> h | <f1> creator: None" ];
+        "pred" [ label = "<f0> pred | <f1> creator: matmul" ];
+        "loss" [ label = "<f0> loss | <f1> creator: softmax" ];
+    }
+    subgraph data_flow {
+        "x":f0 -> "matmul0":f0;
+        "W_1":f0 -> "matmul0":f0;
+        "matmul0":f0 -> "h":f0;
+        "h":f0 -> "matmul1":f0;
+        "W_2":f0 -> "matmul1":f0;
+        "matmul1":f0 -> "pred":f0;
+        "pred":f0 -> "softmax":f0;
+        "label":f0 -> "softmax":f0;
+        "softmax":f0 -> "loss":f0;
+    }
+    subgraph prev_func {
+        edge [color="red", arrowsize="0.6", penwidth="1", constraint=false];
+        "matmul1":f1 -> "matmul0":f0;
+        "softmax":f1 -> "matmul1":f0;
+        label = "prev_func";
+    }
+}
+</details>
+![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20})
+Chainer and Autograd uses the similar techniques to record the forward pass. For details please refer to the appendix.
+## Design choices
+### 1) Dynet's List vs Pytorch's Node Graph
+What's good about List:
+1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and calling the corresponding backward operator.
+1. It promises effient data parallelism implementations. One could count the time of usage of a certain variable during the construction list. Then in the play back, one knows the calculation of a variable has completed. This enables communication and computation overlapping.
+What's good about Node Graph:
+1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. [[2]](https://openreview.net/pdf?id=BJJsrmfCZ) Consider the following example, Pytorch only does backward on SmallNet while Dynet does both BigNet and SmallNet.
+```python
+result = BigNet(data)
+loss = SmallNet(data)
+loss.backward()
+```
+### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation
+Dynet builds the list in a symbolic matter. Consider the following example
+```python
+for epoch in range(num_epochs):
+    for in_words, out_label in training_data:
+        dy.renew_cg()
+        W = dy.parameter(W_p)
+        b = dy.parameter(b_p)
+        score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
+        loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
+        loss_val = loss_sym.value()
+        loss_sym.backward()
+```
+The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This defered execution is useful because it allows some graph-like optimization possible, e.g. kernel fusion.
+Pytorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
+## What can fluid learn from them?
+TBD
+# Appendix
+### Overview
+| Framework | Has Tape | Core in C++ | First Release Date |
+|-----------|----------|-------------|--------------------|
+| Autograd  | No       | No          | Mar 5, 2015        |
+| Chainer   | No       | No          | Jun 5, 2015        |
+| Pytorch   | No       | Yes         | Aug 31, 2016       |
+| Dynet     | Yes      | Yes         | Oct 12, 2016       |
+### Source Code
+#### Autograd
+[Backward code](https://github.com/HIPS/autograd/blob/442205dfefe407beffb33550846434baa90c4de7/autograd/core.py#L8-L40). In the forward pass, a graph of VJPNode is constructed.
+```python
+# User API
+def make_grad(fun, x):
+    start_node = VJPNode.new_root()
+    end_value, end_node =  trace(start_node, fun, x)
+    return backward_pass(g, end_node), end_value
+# trace the forward pass by creating VJPNodes
+def trace(start_node, fun, x):
+    with trace_stack.new_trace() as t:
+        start_box = new_box(x, t, start_node)
+        end_box = fun(start_box)
+        return end_box._value, end_box._node
+def backward_pass(g, end_node):
+    outgrads = {end_node : (g, False)}
+    for node in toposort(end_node):
+        outgrad = outgrads.pop(node)
+        ingrads = node.vjp(outgrad[0])
+        for parent, ingrad in zip(node.parents, ingrads):
+            outgrads[parent] = add_outgrads(outgrads.get(parent), ingrad)
+    return outgrad[0]
+# Every VJPNode corresponds to a op_grad
+class VJPNode(Node):
+    __slots__ = ['parents', 'vjp']
+    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
+        self.parents = parents
+        vjpmaker = primitive_vjps[fun]
+        self.vjp = vjpmaker(parent_argnums, value, args, kwargs)
+```
+#### Chainer
+Example Code
+```python
+# (1) Function Set definition, creates FunctionNode
+model = FunctionSet(
+    l1=F.Linear(784, 100),
+    l2=F.Linear(100, 100),
+    l3=F.Linear(100, 10)).to_gpu()
+# (2) Optimizer Setup
+opt = optimizers.SGD()
+opt.setup(model)
+# (3) Forward computation
+def forward(x, t):
+    h1 = F.relu(model.l1(x))
+    h2 = F.relu(model.l2(h1))
+    y = model.l3(h2)
+    return F.softmax_cross_entropy(y, t)
+# (4) Training loop
+for epoch in xrange(n_epoch):
+    for i in xrange(0, N, b_size):
+        x = Variable(to_gpu(...))
+        t = Variable(to_gpu(...))
+        opt.zero_grads()
+        loss = forward(x, t)
+        loss.backward()
+        opt.update()
+```
+In `forward(x, t)`, a graph of [`VariableNode`](https://github.com/chainer/chainer/blob/master/chainer/variable.py#L110) and [`FunctionNode`](https://github.com/chainer/chainer/blob/a69103a4aa59d5b318f39b01dbcb858d465b89cf/chainer/function_node.py#L19) is constructed. Every output's `VariableNode.creator` is pointed to the `FunctionNode`.
+```python
+class FunctionNode(object):
+    ...
+    def apply(self, inputs):
+        outputs = self.forward(inputs)
+        ret = tuple([variable.Variable(y, requires_grad=requires_grad)
+                     for y in outputs])
+        # Topological ordering
+        self.rank = max([x.rank for x in inputs]) if input_vars else 0
+        # Add backward edges
+        for y in ret:
+            y.creator_node = self
+        self.inputs = tuple([x.node for x in input_vars])
+        self.outputs = tuple([y.node for y in ret])
+        return ret
+```
+`loss.backward()` will calculate the accumulated gradient of all variables. All the backward of `FunctionNode`s will be called based on the topological order.
+```python
+class VariableNode(object):
+    ...
+    def backward(self, retain_grad, loss_scale):
+        if self.creator_node is None:
+            return
+        cand_funcs = []
+        seen_set = set()
+        grads = {}
+        # Initialize error by 1, if this is a loss variable
+        if self.data.size == 1 and self._grad_var is None:
+            self.grad = numpy.ones_like(self.data)
+        grads[self._node] = self._grad_var
+        def add_cand(cand):
+            if cand not in seen_set:
+                # Negate since heapq is min-heap. This is a global variable
+                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
+                seen_set.add(cand)
+        add_cand(self.creator_node)
+        while cand_funcs:
+            _, _, func = heapq.heappop(cand_funcs)
+            gxs = func.backward_accumulate(func.inputs, func.outputs, func.outputs.grad)
+            for x, gx in enumerate(gxs):
+                if x in grads:
+                    grads[x] += gx
+                else:
+                    grads[x] = gx
+                if x.creator_node is not None:
+                    add_cand(x.creator_node)
+```
+#### PyTorch
+Example Code
+```python
+x = Variable(torch.ones(5, 5))
+y = Variable(torch.ones(5, 5) * 4)
+z = x ** 2 + x * 2 + x * y + y
+z.backward(torch.ones(5, 5))
+```
+The trace is done by `Variable.creator` and `Function.previous_functions`.
+```python
+class Variable(object):
+    def __init__(self, tensor, creator=None, requires_grad=True):
+        if creator is None:
+            creator = Leaf(self, requires_grad)
+        self.data = tensor
+        self.creator = creator
+        self._grad = None
+    def backward(self, gradient=None):
+        if gradient is None:
+            if self.data.numel() != 1:
+                raise RuntimeError('backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable')
+            gradient = self.data.new(1).fill_(1)
+        self._execution_engine.run_backward(self, gradient)
+class Function(obejct):
+    # ...
+    def _do_forward(self, *input):
+        unpacked_input = tuple(arg.data for arg in input)
+        raw_output = self.forward(*unpacked_input)
+        # mark output.creator = self for backward trace
+        output = tuple(Variable(tensor, self) for tensor in raw_output)
+        self.previous_functions = [(arg.creator, id(arg)) for arg in input]
+        self.output_ids = {id(var): i for i, var in enumerate(output)}
+        return output
+    def _do_backward(self, grad_output):
+        return self.backwaerd(grad_output)
+```
+The [backward](https://github.com/pytorch/pytorch/blob/v0.1.1/torch/autograd/engine.py) is similar to Autograd.
+#### DyNet
+Example code
+```python
+model = dy.model()
+W_p = model.add_parameters((20, 100))
+b_p = model.add_parameters(20)
+E = model.add_lookup_parameters((20000, 50))
+for epoch in range(num_epochs):
+    for in_words, out_label in training_data:
+        dy.renew_cg() # init tape
+        W = dy.parameter(W_p)
+        b = dy.parameter(b_p)
+        score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
+        loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
+        loss_val = loss_sym.value()
+        loss_sym.backward()
+```
+[forward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L84-L158), [backward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L166-L284). The trace is done by creating a tape of expressions in every iteration. Backward is done by traverse the tape in the reverse order.
+```c++
+void SimpleExecutionEngine::backward(VariableIndex from_where, bool full) {
+  ...  
+  for (int i = num_nodes - 1; i >= 0; --i) {
+    // each node corresponds to an op
+    node->backward(xs, node_fx, node_dEdfx, ai, node_dEdxai);
+  }
+  ...
+}
+```
--- a/doc/v2/build_and_install/build_from_source_cn.rst
+++ b/doc/v2/build_and_install/build_from_source_cn.rst
@@ -23,7 +23,7 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 在 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__ 找到 paddle_manylinux_devel
 镜像的编译以及使用方法。或者参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。
-如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 :ref:`编译依赖 <_compile_deps>` 之后才能开始编译的步骤。
 编译PaddlePaddle，需要执行：
@@ -106,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 - 学习 Docker 有多难？
-  理解 Docker 并不难，大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+  理解 Docker 并不难，大概花十分钟看一下 `如何使用Docker <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
 - 我可以用 IDE 吗？
@@ -123,7 +123,7 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 - 可以并行编译吗？
-  是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+  是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
 - Docker 需要 sudo
@@ -131,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 - 在 Windows/MacOS 上编译很慢
-  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。
 - 磁盘不够
-  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
+  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `如何删除Docker Container <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
 .. _compile_deps:
@@ -195,7 +195,7 @@ BLAS
 PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
 `OpenBlAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
-还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+还会下载MKL-DNN数学库，详细参考 `mkldnn设计文档 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
 如果关闭MKL，则会使用OpenBLAS作为BLAS库。
@@ -211,7 +211,7 @@ PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，
 编译选项的设置
 ++++++++++++++
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
+PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如
 ..  code-block:: bash

--- a/doc/v2/build_and_install/build_from_source_en.rst
+++ b/doc/v2/build_and_install/build_from_source_en.rst
@@ -11,7 +11,7 @@ To build PaddlePaddle, you need
 1. A computer -- Linux, Windows, MacOS.
 2. Docker.
-Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image. 
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.
 We run all the tools by running this image.
 .. _build_step:
@@ -26,6 +26,8 @@ you can also find how to build and use paddle_manylinux_devel Docker image from
 `here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__
 Or you can build your own image from source as the optional step below:
+If you don't wish to use docker，you need to install several compile dependencies manually as :ref:`Compile Dependencies <_compile_deps>` shows to start compilation.
 .. code-block:: bash
   # 1. clone the source code
@@ -108,7 +110,7 @@ Frequently Asked Questions
 - How difficult is it to learn Docker?
-    It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools.  Not even to mention the time saved when other people trying to reproduce the issue you have.
+    It takes you ten minutes to read `an introductory article <https://docs.docker.com/get-started>`_ and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools.  Not even to mention the time saved when other people trying to reproduce the issue you have.
 - Can I use my favorite IDE?
@@ -125,7 +127,7 @@ Frequently Asked Questions
 - Does Docker do parallel building?
-  Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
+  Our building Docker image runs a  `Bash script <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ , which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
 - Docker requires sudo
@@ -133,11 +135,11 @@ Frequently Asked Questions
 - Docker on Windows/MacOS builds slowly
-  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to `this issue  <https://github.com/PaddlePaddle/Paddle/issues/627>`_ for details.
 - Not enough disk space
-  Examples in this article use option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
+  Examples in this article use option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to `this article <https://zaiste.net/posts/removing_docker_containers/>`_ .
 .. _compile_deps:

--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -17,6 +17,48 @@ if(APPLE)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)
+set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files")
+set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
+set(inference_deps paddle_inference_api paddle_fluid_api)
+# if anakin is set enable anakin api implementation
+if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
+    set(ANAKIN_FOUND ON)
+else()
+    set(ANAKIN_FOUND OFF)
+endif()
+function(fetch_include_recursively root_dir) 
+    if (IS_DIRECTORY ${root_dir}) 
+        include_directories(${root_dir})
+    endif()
+    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
+    foreach(sub ${ALL_SUB})
+        if (IS_DIRECTORY ${root_dir}/${sub})
+            fetch_include_recursively(${root_dir}/${sub})
+        endif()
+    endforeach()
+endfunction()
+if (ANAKIN_FOUND)
+    # Anakin's code style doesn't follow google c style.
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
+    message(STATUS "Anakin for inference is enabled")
+    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
+    fetch_include_recursively(${ANAKIN_INCLUDE})
+    link_directories(${ANAKIN_LIBRARY})
+    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
+    list(APPEND inference_deps inference_anakin_api)
+endif()
 function(inference_api_test TARGET_NAME)
    if (WITH_TESTING)
        set(options "")
@@ -27,7 +69,7 @@ function(inference_api_test TARGET_NAME)
        set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
        cc_test(${TARGET_NAME}
                SRCS ${TARGET_NAME}.cc
-                DEPS paddle_fluid paddle_inference_api
+                DEPS "${inference_deps}"
                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
        if(inference_test_ARGS)
            set_tests_properties(${TARGET_NAME}
@@ -37,7 +79,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)
 cc_library(paddle_inference_api
-    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc 
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 cc_test(test_paddle_inference_api
@@ -47,6 +89,11 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_paddle_inference_api_impl
                    ARGS test_word2vec test_image_classification)
+if (ANAKIN_FOUND)
+    cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
+    DEPS ${inference_deps})
+endif()
 if(WITH_TESTING)
    add_subdirectory(demo)
 endif()
--- a/paddle/contrib/inference/demo/simple_on_word2vec.cc
+++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc
@@ -19,8 +19,8 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <memory>
+#include <thread>
 #include "paddle/contrib/inference/paddle_inference_api.h"
 namespace paddle {
 namespace demo {
@@ -54,18 +54,75 @@ void Main(bool use_gpu) {
    CHECK(predictor->Run(slots, &outputs));
    //# 4. Get output.
-    ASSERT_EQ(outputs.size(), 1);
+    ASSERT_EQ(outputs.size(), 1UL);
    LOG(INFO) << "output buffer size: " << outputs.front().data.length;
    const size_t num_elements = outputs.front().data.length / sizeof(float);
    // The outputs' buffers are in CPU memory.
    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
      LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
    }
+    // TODO(Superjomn): this is should be free automatically
+    free(outputs[0].data.data);
+  }
+}
+void MainThreads(int num_threads, bool use_gpu) {
+  // Multi-threads only support on CPU
+  // 0. Create PaddlePredictor with a config.
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.use_gpu = use_gpu;
+  config.fraction_of_gpu_memory = 0.15;
+  config.device = 0;
+  auto main_predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_threads; ++tid) {
+    threads.emplace_back([&, tid]() {
+      // 1. clone a predictor which shares the same parameters
+      auto predictor = main_predictor->Clone();
+      constexpr int num_batches = 3;
+      for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
+        // 2. Dummy Input Data
+        int64_t data[4] = {1, 2, 3, 4};
+        PaddleBuf buf{.data = data, .length = sizeof(data)};
+        PaddleTensor tensor{.name = "",
+                            .shape = std::vector<int>({4, 1}),
+                            .data = buf,
+                            .dtype = PaddleDType::INT64};
+        std::vector<PaddleTensor> inputs(4, tensor);
+        std::vector<PaddleTensor> outputs;
+        // 3. Run
+        CHECK(predictor->Run(inputs, &outputs));
+        // 4. Get output.
+        ASSERT_EQ(outputs.size(), 1UL);
+        LOG(INFO) << "TID: " << tid << ", "
+                  << "output buffer size: " << outputs.front().data.length;
+        const size_t num_elements = outputs.front().data.length / sizeof(float);
+        // The outputs' buffers are in CPU memory.
+        for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+          LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
+        }
+        free(outputs[0].data.data);
+      }
+    });
+  }
+  for (int i = 0; i < num_threads; ++i) {
+    threads[i].join();
  }
 }
 TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
+#ifdef PADDLE_WITH_CUDA
 TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
+#endif
 }  // namespace demo
 }  // namespace paddle
--- a/paddle/contrib/inference/paddle_inference_api.h
+++ b/paddle/contrib/inference/paddle_inference_api.h
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-   Licensed under the Apache License, Version 2.0 (the "License");
+Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
+you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
+Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
+distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
+See the License for the specific language governing permissions and
-   limitations under the License. */
+limitations under the License. */
 /*
 * This file contains the definition of a simple Inference API for Paddle.
@@ -47,8 +47,8 @@ struct PaddleTensor {
 enum class PaddleEngineKind {
  kNative = 0,  // Use the native Fluid facility.
+  kAnakin,      // Use Anakin for inference.
  // TODO(Superjomn) support following engines latter.
-  // kAnakin,             // Use Anakin for inference.
  // kTensorRT,           // Use TensorRT for inference.
  // kAutoMixedAnakin,    // Automatically mix Fluid with Anakin.
  // kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
@@ -63,6 +63,7 @@ class PaddlePredictor {
  struct Config;
  PaddlePredictor() = default;
  PaddlePredictor(const PaddlePredictor&) = delete;
+  PaddlePredictor& operator=(const PaddlePredictor&) = delete;
  // Predict an record.
  // The caller should be responsible for allocating and releasing the memory of
@@ -76,7 +77,7 @@ class PaddlePredictor {
  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
  // Destroy the Predictor.
-  virtual ~PaddlePredictor() {}
+  virtual ~PaddlePredictor() = default;
  // The common configs for all the predictors.
  struct Config {
@@ -95,6 +96,13 @@ struct NativeConfig : public PaddlePredictor::Config {
  std::string param_file;
 };
+// Configurations for Anakin engine.
+struct AnakinConfig : public PaddlePredictor::Config {
+  int device;
+  std::string model_file;
+  int max_batch_size{-1};
+};
 // A factory to help create different predictors.
 //
 // FOR EXTENSION DEVELOPER:
@@ -105,5 +113,4 @@ struct NativeConfig : public PaddlePredictor::Config {
 // Similarly, each engine kind should map to a unique predictor implementation.
 template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
 }  // namespace paddle
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <cuda.h>
+#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
+namespace paddle {
+PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
+    const AnakinConfig &config) {
+  CHECK(Init(config));
+}
+bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
+  if (!(graph_.load(config.model_file))) {
+    return false;
+  }
+  graph_.ResetBatchSize("input_0", config.max_batch_size);
+  // optimization for graph
+  if (!(graph_.Optimize())) {
+    return false;
+  }
+  // construct executer
+  executor_.init(graph_);
+  return true;
+}
+bool PaddleInferenceAnakinPredictor::Run(
+    const std::vector<PaddleTensor> &inputs,
+    std::vector<PaddleTensor> *output_data) {
+  for (const auto &input : inputs) {
+    if (input.dtype != PaddleDType::FLOAT32) {
+      LOG(ERROR) << "Only support float type inputs. " << input.name
+                 << "'s type is not float";
+      return false;
+    }
+    auto d_tensor_in_p = executor_.get_in(input.name);
+    float *d_data_p = d_tensor_in_p->mutable_data();
+    if (cudaMemcpy(d_data_p,
+                   static_cast<float *>(input.data.data),
+                   d_tensor_in_p->valid_size() * sizeof(float),
+                   cudaMemcpyHostToDevice) != 0) {
+      LOG(ERROR) << "copy data from CPU to GPU error";
+      return false;
+    }
+  }
+  executor_.prediction();
+  if (output_data->empty()) {
+    LOG(ERROR) << "At least one output should be set with tensors' names.";
+    return false;
+  }
+  for (auto &output : *output_data) {
+    auto *tensor = executor_.get_out(output.name);
+    output.shape = tensor->shape();
+    // Copy data from GPU -> CPU
+    if (cudaMemcpy(output.data.data,
+                   tensor->mutable_data(),
+                   tensor->valid_size() * sizeof(float),
+                   cudaMemcpyDeviceToHost) != 0) {
+      LOG(ERROR) << "copy data from GPU to CPU error";
+      return false;
+    }
+  }
+  return true;
+}
+anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+    &PaddleInferenceAnakinPredictor::get_executer() {
+  return executor_;
+}
+// the cloned new Predictor of anakin share the same net weights from original
+// Predictor
+std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
+  VLOG(3) << "Anakin Predictor::clone";
+  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
+  // construct executer from other graph
+  auto anakin_predictor_p =
+      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
+  if (!anakin_predictor_p) {
+    LOG(ERROR) << "fail to call Init";
+    return nullptr;
+  }
+  anakin_predictor_p->get_executer().init(graph_);
+  return std::move(cls);
+}
+// A factory to help create difference predictor.
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
+    const AnakinConfig &config) {
+  VLOG(3) << "Anakin Predictor create.";
+  std::unique_ptr<PaddlePredictor> x(
+      new PaddleInferenceAnakinPredictor(config));
+  return x;
+};
+}  // namespace paddle
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+/*
+ * This file contains the implementation of inference API with Anakin engine
+ * embeded, this API can only support Anakin models.
+ */
+#pragma once
+// NOTE This header file do not have namespace.
+//#include <test/framework/net/paddle_api.h>
+#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "framework/core/net/net.h"
+#include "saber/saber_types.h"
+namespace paddle {
+class PaddleInferenceAnakinPredictor : public PaddlePredictor {
+ public:
+  PaddleInferenceAnakinPredictor() {}
+  PaddleInferenceAnakinPredictor(const AnakinConfig& config);
+  // NOTE Unlike the native engine, the buffers of anakin engine's output_data
+  // should be allocated first.
+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data) override;
+  std::unique_ptr<PaddlePredictor> Clone() override;
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
+  get_executer();
+  ~PaddleInferenceAnakinPredictor() override{};
+ private:
+  bool Init(const AnakinConfig& config);
+  anakin::graph::Graph<anakin::NV,
+                       anakin::saber::AK_FLOAT,
+                       anakin::Precision::FP32>
+      graph_;
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+      executor_;
+  AnakinConfig config_;
+};
+}  // namespace paddle
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "paddle/contrib/inference/paddle_inference_api.h"
+namespace paddle {
+AnakinConfig GetConfig() {
+  AnakinConfig config;
+  config.model_file = "./mobilenet_v2.anakin.bin";
+  config.device = 0;
+  config.max_batch_size = 1;
+  return config;
+}
+TEST(inference, anakin) {
+  AnakinConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
+  float data[1 * 3 * 224 * 224] = {1.0f};
+  PaddleBuf buf{.data = data, .length = sizeof(data)};
+  PaddleTensor tensor{.name = "input_0",
+                      .shape = std::vector<int>({1, 3, 224, 224}),
+                      .data = buf,
+                      .dtype = PaddleDType::FLOAT32};
+  // For simplicity, we set all the slots with the same data.
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+  float data_out[1000];
+  PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
+  PaddleTensor tensor_out{.name = "prob_out",
+                          .shape = std::vector<int>({1000, 1}),
+                          .data = buf_out,
+                          .dtype = PaddleDType::FLOAT32};
+  std::vector<PaddleTensor> outputs(1, tensor_out);
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+  float* data_o = static_cast<float*>(outputs[0].data.data);
+  for (size_t j = 0; j < 1000; ++j) {
+    LOG(INFO) << "output[" << j << "]: " << data_o[j];
+  }
+}
+}  // namespace paddle
--- a/paddle/contrib/inference/paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-   Licensed under the Apache License, Version 2.0 (the "License");
+Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
+you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
+Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
+distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
+See the License for the specific language governing permissions and
-   limitations under the License. */
+limitations under the License. */
 #include <sys/time.h>
 #include <algorithm>
@@ -54,7 +54,8 @@ std::string num2str(T a) {
 }
 }  // namespace
-bool NativePaddlePredictor::Init() {
+bool NativePaddlePredictor::Init(
+    std::shared_ptr<framework::Scope> parent_scope) {
  VLOG(3) << "Predictor::init()";
  if (config_.use_gpu) {
@@ -62,9 +63,15 @@ bool NativePaddlePredictor::Init() {
  } else {
    place_ = paddle::platform::CPUPlace();
  }
-  paddle::framework::InitDevices(false);
+  if (parent_scope) {
+    scope_ = parent_scope;
+    sub_scope_ = &(parent_scope->NewScope());
+  } else {
+    paddle::framework::InitDevices(false);
+    scope_.reset(new paddle::framework::Scope());
+  }
  executor_.reset(new paddle::framework::Executor(place_));
-  scope_.reset(new paddle::framework::Scope());
  // Initialize the inference program
  if (!config_.model_dir.empty()) {
@@ -83,13 +90,8 @@ bool NativePaddlePredictor::Init() {
    return false;
  }
  ctx_ = executor_->Prepare(*inference_program_, 0);
+  executor_->CreateVariables(
-  // Create temporary variables first, so that the first batch do not need to
+      *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
-  // create variables in the runtime. This is the logics of the old inference
-  // API.
-  // TODO(Superjomn) this should be modified when `Clone` is valid for
-  // multi-thread application.
-  executor_->CreateVariables(*inference_program_, scope_.get(), 0);
  // Get the feed_target_names and fetch_target_names
  feed_target_names_ = inference_program_->GetFeedTargetNames();
@@ -97,6 +99,13 @@ bool NativePaddlePredictor::Init() {
  return true;
 }
+NativePaddlePredictor::~NativePaddlePredictor() {
+  if (sub_scope_) {
+    PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!");
+    scope_->DeleteScope(sub_scope_);
+  }
+};
 bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                std::vector<PaddleTensor> *output_data) {
  VLOG(3) << "Predictor::predict";
@@ -121,11 +130,12 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
  }
  // Run the inference program
  // if share variables, we need not create variables
-  executor_->RunPreparedContext(ctx_.get(),
+  executor_->RunPreparedContext(
-                                scope_.get(),
+      ctx_.get(),
-                                &feed_targets,
+      sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
-                                &fetch_targets,
+      &feed_targets,
-                                false /* don't create variable eatch time */);
+      &fetch_targets,
+      false /* don't create variable eatch time */);
  if (!GetFetch(fetchs, output_data)) {
    LOG(ERROR) << "fail to get fetchs";
    return false;
@@ -138,7 +148,7 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
  VLOG(3) << "Predictor::clone";
  std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
-  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init()) {
+  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(scope_)) {
    LOG(ERROR) << "fail to call Init";
    return nullptr;
  }
@@ -266,7 +276,7 @@ CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
  }
  std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
-  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init()) {
+  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
    return nullptr;
  }
  return std::move(predictor);

--- a/paddle/contrib/inference/paddle_inference_api_impl.h
+++ b/paddle/contrib/inference/paddle_inference_api_impl.h
@@ -34,14 +34,15 @@ class NativePaddlePredictor : public PaddlePredictor {
  explicit NativePaddlePredictor(const NativeConfig &config)
      : config_(config) {}
-  bool Init();
+  // will only create sub scope if have global scope
+  bool Init(std::shared_ptr<framework::Scope> parent_scope);
  bool Run(const std::vector<PaddleTensor> &inputs,
           std::vector<PaddleTensor> *output_data) override;
  std::unique_ptr<PaddlePredictor> Clone() override;
-  ~NativePaddlePredictor() override{};
+  ~NativePaddlePredictor() override;
 private:
  bool SetFeed(const std::vector<PaddleTensor> &input_datas,
@@ -52,11 +53,13 @@ class NativePaddlePredictor : public PaddlePredictor {
  NativeConfig config_;
  platform::Place place_;
  std::unique_ptr<framework::Executor> executor_;
-  std::unique_ptr<framework::Scope> scope_;
+  std::shared_ptr<framework::Scope> scope_;
  std::unique_ptr<framework::ExecutorPrepareContext> ctx_;
  std::unique_ptr<framework::ProgramDesc> inference_program_;
  std::vector<std::string> feed_target_names_;
  std::vector<std::string> fetch_target_names_;
+  // Do not use unique_ptr, use parent scope to delete
+  framework::Scope *sub_scope_{nullptr};
 };
 }  // namespace paddle
--- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -15,6 +15,8 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <thread>
 #include "gflags/gflags.h"
 #include "paddle/contrib/inference/paddle_inference_api_impl.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
@@ -45,14 +47,19 @@ NativeConfig GetConfig() {
  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
  LOG(INFO) << "dirname  " << config.model_dir;
  config.fraction_of_gpu_memory = 0.15;
+#ifdef PADDLE_WITH_CUDA
  config.use_gpu = true;
+#else
+  config.use_gpu = false;
+#endif
  config.device = 0;
  return config;
 }
-TEST(paddle_inference_api_impl, word2vec) {
+void MainWord2Vec(bool use_gpu) {
  NativeConfig config = GetConfig();
  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
+  config.use_gpu = use_gpu;
  framework::LoDTensor first_word, second_word, third_word, fourth_word;
  framework::LoD lod{{0, 1}};
@@ -100,11 +107,11 @@ TEST(paddle_inference_api_impl, word2vec) {
  free(outputs[0].data.data);
 }
-TEST(paddle_inference_api_impl, image_classification) {
+void MainImageClassification(bool use_gpu) {
  int batch_size = 2;
-  bool use_mkldnn = false;
  bool repeat = false;
  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
  config.model_dir =
      FLAGS_dirname + "image_classification_resnet.inference.model";
@@ -126,12 +133,8 @@ TEST(paddle_inference_api_impl, image_classification) {
  std::vector<framework::LoDTensor*> cpu_fetchs1;
  cpu_fetchs1.push_back(&output1);
-  TestInference<platform::CPUPlace, false, true>(config.model_dir,
+  TestInference<platform::CPUPlace, false, true>(
-                                                 cpu_feeds,
+      config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined);
-                                                 cpu_fetchs1,
-                                                 repeat,
-                                                 is_combined,
-                                                 use_mkldnn);
  auto predictor = CreatePaddlePredictor(config);
  std::vector<PaddleTensor> paddle_tensor_feeds;
@@ -149,4 +152,143 @@ TEST(paddle_inference_api_impl, image_classification) {
  free(data);
 }
+void MainThreadsWord2Vec(bool use_gpu) {
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
+  // prepare inputs data and reference results
+  constexpr int num_jobs = 3;
+  std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
+  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
+  std::vector<framework::LoDTensor> refs(num_jobs);
+  for (size_t i = 0; i < jobs.size(); ++i) {
+    // each job has 4 words
+    jobs[i].resize(4);
+    for (size_t j = 0; j < 4; ++j) {
+      framework::LoD lod{{0, 1}};
+      int64_t dict_size = 2073;  // The size of dictionary
+      SetupLoDTensor(&jobs[i][j], lod, static_cast<int64_t>(0), dict_size - 1);
+      paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j]));
+    }
+    // get reference result of each job
+    std::vector<paddle::framework::LoDTensor*> ref_feeds;
+    std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    for (auto& word : jobs[i]) {
+      ref_feeds.push_back(&word);
+    }
+    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
+  }
+  // create threads and each thread run 1 job
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto predictor = main_predictor->Clone();
+      auto& local_inputs = paddle_tensor_feeds[tid];
+      std::vector<PaddleTensor> local_outputs;
+      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
+      // check outputs range
+      ASSERT_EQ(local_outputs.size(), 1UL);
+      const size_t len = local_outputs[0].data.length;
+      float* data = static_cast<float*>(local_outputs[0].data.data);
+      for (size_t j = 0; j < len / sizeof(float); ++j) {
+        ASSERT_LT(data[j], 1.0);
+        ASSERT_GT(data[j], -1.0);
+      }
+      // check outputs correctness
+      float* ref_data = refs[tid].data<float>();
+      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
+      for (int i = 0; i < refs[tid].numel(); ++i) {
+        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+      }
+      free(data);
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+}
+void MainThreadsImageClassification(bool use_gpu) {
+  constexpr int num_jobs = 4;  // each job run 1 batch
+  constexpr int batch_size = 1;
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  config.model_dir =
+      FLAGS_dirname + "image_classification_resnet.inference.model";
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
+  std::vector<framework::LoDTensor> jobs(num_jobs);
+  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
+  std::vector<framework::LoDTensor> refs(num_jobs);
+  for (size_t i = 0; i < jobs.size(); ++i) {
+    // prepare inputs
+    std::vector<std::vector<int64_t>> feed_target_shapes =
+        GetFeedTargetShapes(config.model_dir, /*is_combined*/ false);
+    feed_target_shapes[0][0] = batch_size;
+    framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
+    SetupTensor<float>(&jobs[i], input_dims, 0.f, 1.f);
+    paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i]));
+    // get reference result of each job
+    std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
+    std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
+  }
+  // create threads and each thread run 1 job
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto predictor = main_predictor->Clone();
+      auto& local_inputs = paddle_tensor_feeds[tid];
+      std::vector<PaddleTensor> local_outputs;
+      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
+      // check outputs correctness
+      ASSERT_EQ(local_outputs.size(), 1UL);
+      const size_t len = local_outputs[0].data.length;
+      float* data = static_cast<float*>(local_outputs[0].data.data);
+      float* ref_data = refs[tid].data<float>();
+      EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
+      for (int i = 0; i < refs[tid].numel(); ++i) {
+        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+      }
+      free(data);
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+}
+TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); }
+TEST(inference_api_native, word2vec_cpu_threads) {
+  MainThreadsWord2Vec(false /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_cpu) {
+  MainThreadsImageClassification(false /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_cpu_threads) {
+  MainThreadsImageClassification(false /*use_gpu*/);
+}
+#ifdef PADDLE_WITH_CUDA
+TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
+TEST(inference_api_native, word2vec_gpu_threads) {
+  MainThreadsWord2Vec(true /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_gpu) {
+  MainThreadsImageClassification(true /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_gpu_threads) {
+  MainThreadsImageClassification(true /*use_gpu*/);
+}
+#endif
 }  // namespace paddle
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -83,11 +83,16 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
+if(WITH_DISTRIBUTE)
-framework_proto glog lod_rank_table feed_fetch_method)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+else()
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
+endif()
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -169,17 +169,13 @@ void BlockDesc::Flush() {
  }
  if (need_update_) {
-    auto &op_field = *this->desc_->mutable_ops();
+    this->desc_->mutable_ops()->Clear();
-    this->ClearPBOps();
-    op_field.Reserve(static_cast<int>(ops_.size()));
    for (auto &op_desc : ops_) {
-      op_field.AddAllocated(op_desc->Proto());
+      this->desc_->mutable_ops()->Add()->CopyFrom(*op_desc->Proto());
    }
-    auto &var_field = *this->desc_->mutable_vars();
+    this->desc_->mutable_vars()->Clear();
-    this->ClearPBVars();
-    var_field.Reserve(static_cast<int>(vars_.size()));
    for (auto &var_desc : vars_) {
-      var_field.AddAllocated(var_desc.second->Proto());
+      this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto());
    }
    need_update_ = false;
  }
@@ -217,22 +213,6 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
  }
 }
-void BlockDesc::ClearPBOps() {
-  auto ops = this->desc_->mutable_ops();
-  while (!ops->empty()) {
-    // we do not own the OpDesc, so release the ownership.
-    ops->ReleaseLast();
-  }
-}
-void BlockDesc::ClearPBVars() {
-  auto vars = this->desc_->mutable_vars();
-  while (!vars->empty()) {
-    // we do not own the VarDesc, so release the ownership.
-    vars->ReleaseLast();
-  }
-}
 void BlockDesc::SetForwardBlockID(int32_t forward_block_id) {
  PADDLE_ENFORCE(!desc_->has_forward_block_idx(),
                 "Parent block ID has been set to %d. Cannot set to %d",

--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -41,11 +41,6 @@ class BlockDesc {
  BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog);
-  ~BlockDesc() {
-    this->ClearPBVars();
-    this->ClearPBOps();
-  }
  int32_t ID() const { return desc_->idx(); }
  int32_t Parent() const { return desc_->parent_idx(); }
@@ -113,10 +108,6 @@ class BlockDesc {
  ProgramDesc *Program() const { return this->prog_; }
- private:
-  void ClearPBOps();
-  void ClearPBVars();
 private:
  ProgramDesc *prog_;       // not_own
  proto::BlockDesc *desc_;  // not_own

--- a/paddle/fluid/framework/data_layout.h
+++ b/paddle/fluid/framework/data_layout.h
@@ -27,6 +27,7 @@ enum class DataLayout {
  kNHWC = 0,
  kNCHW = 1,
  kAnyLayout = 2,
+  kMKLDNN = 3,  // all layouts supported by MKLDNN internally
 };
 inline DataLayout StringToDataLayout(const std::string& str) {
@@ -41,6 +42,8 @@ inline DataLayout StringToDataLayout(const std::string& str) {
    return DataLayout::kNCHW;
  } else if (s == "ANYLAYOUT") {
    return DataLayout::kAnyLayout;
+  } else if (s == "MKLDNNLAYOUT") {
+    return DataLayout::kMKLDNN;
  } else {
    PADDLE_THROW("Unknown storage order string: %s", s);
  }
@@ -54,8 +57,10 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) {
      return "NCHW";
    case DataLayout::kAnyLayout:
      return "ANY_LAYOUT";
+    case DataLayout::kMKLDNN:
+      return "MKLDNNLAYOUT";
    default:
-      PADDLE_THROW("unknown DataLayou %d", data_layout);
+      PADDLE_THROW("unknown DataLayout %d", data_layout);
  }
 }

--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -16,6 +16,9 @@
 #include <vector>
 #include "paddle/fluid/operators/math/math_function.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 namespace paddle {
 namespace framework {
@@ -88,5 +91,85 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var,
  out->set_layout(expected_kernel_type.data_layout_);
 }
+#ifdef PADDLE_WITH_MKLDNN
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) {
+  switch (type) {
+    case mkldnn::memory::data_type::f32:
+      return platform::to_void_cast(tensor.data<float>());
+    case mkldnn::memory::data_type::s8:
+      return platform::to_void_cast(tensor.data<char>());
+    case mkldnn::memory::data_type::u8:
+      return platform::to_void_cast(tensor.data<unsigned char>());
+    case mkldnn::memory::data_type::s16:
+      return platform::to_void_cast(tensor.data<int16_t>());
+    case mkldnn::memory::data_type::s32:
+      return platform::to_void_cast(tensor.data<int32_t>());
+    default:
+      PADDLE_THROW("wrong mkldnn type provided");
+  }
+}
+#endif
+void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
+                               const OpKernelType& expected_kernel_type,
+                               const Tensor& in, Tensor* out) {
+  auto in_layout = kernel_type_for_var.data_layout_;
+  auto out_layout = expected_kernel_type.data_layout_;
+  PADDLE_ENFORCE(
+      in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
+      "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
+      "non-MKLDNN");
+#ifdef PADDLE_WITH_MKLDNN
+  PADDLE_ENFORCE(in.format() != memory::format::format_undef &&
+                     in.format() != memory::format::any,
+                 "Input tensor should have specified memory format");
+  // Set default as NCHW in case not specified
+  out_layout =
+      out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
+      pool.Get(expected_kernel_type.place_));
+  auto& cpu_engine = dev_ctx->GetEngine();
+  std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
+  std::vector<int> out_tz = in_tz;
+  memory::data_type in_type = ToMKLDNNDataType(in.type());
+  PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
+                 "Input tensor type is not supported: ", in.type().name());
+  memory::data_type out_type = in_type;
+  memory::format in_format =
+      in_tz.size() == 2 ? memory::format::nc : in.format();
+  memory::format out_format =
+      out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout);
+  void* in_data = GetDataFromTensor(in, in_type);
+  // output tensor has the same dims as input. Reorder don't change dims
+  out->Resize(in.dims());
+  auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
+  auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+  auto out_memory =
+      memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+  platform::Reorder(in_memory, out_memory);
+  out->set_layout(out_layout);
+  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
+  out->set_format(memory::format::format_undef);
+#endif
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -14,6 +14,7 @@
 #pragma once
+#include <map>
 #include <vector>
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -22,6 +23,50 @@
 namespace paddle {
 namespace framework {
+#ifdef PADDLE_WITH_MKLDNN
+using MKLDNNFormat = mkldnn::memory::format;
+using MKLDNNDataType = mkldnn::memory::data_type;
+inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) {
+  switch (layout) {
+    case DataLayout::kNHWC:
+      return MKLDNNFormat::nhwc;
+    case DataLayout::kNCHW:
+      return MKLDNNFormat::nchw;
+    default:
+      PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
+                   DataLayoutToString(layout));
+  }
+}
+inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) {
+  switch (format) {
+    case MKLDNNFormat::nhwc:
+      return DataLayout::kNHWC;
+    case MKLDNNFormat::nchw:
+      return DataLayout::kNCHW;
+    default:
+      PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
+  }
+}
+inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
+  static const std::map<std::type_index, MKLDNNDataType> dict{
+      {std::type_index(typeid(float)), MKLDNNDataType::f32},  // NOLINT
+      {std::type_index(typeid(char)), MKLDNNDataType::s8},    // NOLINT
+      {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8},
+      {std::type_index(typeid(int16_t)), MKLDNNDataType::s16},
+      {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}};
+  auto iter = dict.find(type);
+  if (iter != dict.end()) return iter->second;
+  return MKLDNNDataType::data_undef;
+}
+#endif
+void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
+                               const OpKernelType& expected_kernel_type,
+                               const Tensor& in, Tensor* out);
 std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
 void TransDataLayout(const OpKernelType& kernel_type_for_var,

--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -33,11 +33,34 @@ void DataTransform(const OpKernelType& expected_kernel_type,
  Tensor in;
  in.ShareDataWith(input_tensor);
  Tensor out;
+  DataLayout lin = kernel_type_for_var.data_layout_;
+  DataLayout lout = expected_kernel_type.data_layout_;
  // do layout transform
-  if (NeedTransformLayout(expected_kernel_type.data_layout_,
+  if (NeedTransformLayout(lout, lin)) {
-                          kernel_type_for_var.data_layout_)) {
+    if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) {
-    TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+      PADDLE_ENFORCE(
+          !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN),
+          "No layout transform needed between two MKLDNN OPKernels");
+      if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) {
+#ifdef PADDLE_WITH_MKLDNN
+        // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
+        // Just set layout/format. No real transform occur
+        out.ShareDataWith(input_tensor);
+        out.set_layout(DataLayout::kMKLDNN);
+        out.set_format(ToMKLDNNFormat(lin));
+#endif
+      } else {
+        // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
+        // Do transform via MKLDNN lib
+        TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in,
+                                  &out);
+      }
+    } else {
+      // Case3 - transfrom between Non-MKLDNN OPKernels
+      TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+    }
    transformed = true;
    PassTensorData(&out, &in);
  }

--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -28,6 +28,9 @@ struct DataTypeMap {
 };
 static DataTypeMap* InitDataTypeMap();
+// C++11 removes the need for manual locking. Concurrent execution shall wait if
+// a static local variable is already being initialized.
+// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
 static DataTypeMap& gDataTypeMap() {
  static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
  return *g_data_type_map_;

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -7,26 +7,32 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
+cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
 if(WITH_GPU)
-    nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+    nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda)
+            dynload_cuda variable_visitor)
-    set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
    nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
    nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
 else()
-    set(multi_devices_graph_builder_deps)
+    cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+             variable_visitor)
    cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
    cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 endif()
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle)
+cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
@@ -36,5 +42,6 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha
        device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
        device_context gather_op_handle)
+cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
 #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
 #        device_context reduce_op_handle )
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -11,46 +11,65 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #include <algorithm>
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
 namespace paddle {
 namespace framework {
 namespace details {
-NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
-    const std::vector<Scope *> &local_scopes,
+#ifdef PADDLE_WITH_CUDA
-    const std::vector<platform::Place> &places,
+AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
-    const platform::NCCLContextMap &ctxs)
+                                     const std::vector<platform::Place> &places,
+                                     const platform::NCCLContextMap *ctxs)
    : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
-  for (auto &p : places_) {
+  if (nccl_ctxs_) {
-    this->dev_ctxes_[p] = nccl_ctxs_.DevCtx(p);
+    for (auto &p : places_) {
+      this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
+    }
  }
 }
+#else
+AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                                     const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
+#endif
-void NCCLAllReduceOpHandle::RunImpl() {
+void AllReduceOpHandle::RunImpl() {
-  if (inputs_.size() == 1) {
+  if (NoDummyInputSize() == 1) {
    return;  // No need to all reduce when GPU count = 1;
  } else {
    // Wait input done
    WaitInputVarGenerated();
+    auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-    auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
+    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-    int dtype = -1;
+    PADDLE_ENFORCE_EQ(
-    size_t numel = 0;
+        in_var_handles.size(), places_.size(),
+        "The NoDummyInputSize should be equal to the number of places.");
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), out_var_handles.size(),
+        "The NoDummyInputSize and NoDummyOutputSize should be equal.");
    std::vector<const LoDTensor *> lod_tensors;
    for (size_t i = 0; i < local_scopes_.size(); ++i) {
      auto *s = local_scopes_[i];
      auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
+      auto &lod_tensor =
-      auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
+          local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
      lod_tensors.emplace_back(&lod_tensor);
+      PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+                        "The name of input and output should be equal.");
    }
    if (platform::is_gpu_place(lod_tensors[0]->place())) {
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+      int dtype = -1;
+      size_t numel = 0;
      std::vector<std::function<void()>> all_reduce_calls;
      for (size_t i = 0; i < local_scopes_.size(); ++i) {
        auto &p = places_[i];
@@ -66,7 +85,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
        }
        int dev_id = boost::get<platform::CUDAPlace>(p).device;
-        auto &nccl_ctx = nccl_ctxs_.at(dev_id);
+        auto &nccl_ctx = nccl_ctxs_->at(dev_id);
        auto stream = nccl_ctx.stream();
        auto comm = nccl_ctx.comm_;
        all_reduce_calls.emplace_back([=] {
@@ -81,22 +100,25 @@ void NCCLAllReduceOpHandle::RunImpl() {
          call();
        }
      });
+#else
+      PADDLE_THROW("Not compiled with CUDA");
+#endif
    } else {  // Special handle CPU only Operator's gradient. Like CRF
      auto &trg = *this->local_scopes_[0]
                       ->FindVar(kLocalExecScopeName)
                       ->Get<Scope *>()
-                       ->Var()
+                       ->FindVar(out_var_handles[0]->name_)
                       ->GetMutable<framework::LoDTensor>();
      // Reduce All Tensor to trg in CPU
      ReduceLoDTensor func(lod_tensors, &trg);
      VisitDataType(ToDataType(lod_tensors[0]->type()), func);
-      for (size_t i = 0; i < local_scopes_.size(); ++i) {
+      for (size_t i = 1; i < local_scopes_.size(); ++i) {
        auto &scope =
            *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
        auto &p = places_[i];
-        auto *var = scope.FindVar(var_name);
+        auto *var = scope.FindVar(out_var_handles[i]->name_);
        auto *dev_ctx = dev_ctxes_[p];
        RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
@@ -109,7 +131,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
  }
 }
-std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; }
+std::string AllReduceOpHandle::Name() const { return "all_reduce"; }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -20,17 +20,23 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
+#endif
 namespace paddle {
 namespace framework {
 namespace details {
-struct NCCLAllReduceOpHandle : public OpHandleBase {
+struct AllReduceOpHandle : public OpHandleBase {
-  NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+#ifdef PADDLE_WITH_CUDA
-                        const std::vector<platform::Place> &places,
+  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
-                        const platform::NCCLContextMap &ctxs);
+                    const std::vector<platform::Place> &places,
+                    const platform::NCCLContextMap *ctxs);
+#else
+  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places);
+#endif
  std::string Name() const override;
  // Delay and buffer nccl_all_reduce together can significantly increase
@@ -41,9 +47,11 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
  void RunImpl() override;
 private:
-  const std::vector<Scope *> &local_scopes_;
+  std::vector<Scope *> local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<platform::Place> places_;
-  const platform::NCCLContextMap &nccl_ctxs_;
+#ifdef PADDLE_WITH_CUDA
+  const platform::NCCLContextMap *nccl_ctxs_;
+#endif
 };
 }  // namespace details

--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -59,8 +59,8 @@ struct BroadcastOpHandle : public OpHandleBase {
  void RunImpl() override;
 private:
-  const std::vector<Scope *> &local_scopes_;
+  std::vector<Scope *> local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<platform::Place> places_;
 #ifdef PADDLE_WITH_CUDA
  const platform::NCCLContextMap *nccl_ctxs_;
 #endif

--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -14,6 +14,8 @@
 #pragma once
+#include <string>
 namespace paddle {
 namespace framework {
 namespace details {
@@ -29,6 +31,8 @@ struct BuildStrategy {
  ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
  GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
+  std::string debug_graphviz_path_{""};
 };
 }  // namespace details

--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -20,8 +20,9 @@ namespace details {
 struct ExecutionStrategy {
  size_t num_threads_{0};
-  bool use_event_{true};
+  bool use_cuda_{true};
  bool allow_op_delay_{false};
+  size_t num_iteration_per_drop_scope_{100};
 };
 }  //  namespace details

--- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
+namespace paddle {
+namespace framework {
+namespace details {
+void FuseVarsOpHandle::RunImpl() {
+  WaitInputVarGenerated(place_);
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
+  PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
+  auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  auto out_var_handle = out_var_handles[0];
+  auto out_var = scope->Var(out_var_handle->name_);
+  auto out_tensor = out_var->GetMutable<LoDTensor>();
+  out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
+  int64_t s = 0;
+  for (size_t i = 1; i < out_var_handles.size(); ++i) {
+    auto out_name = out_var_handles[i]->name_;
+    auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
+    auto numel = this->inputs_numel_.at(out_name);
+    out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
+    s += numel;
+  }
+  this->RunAndRecordEvent([] {});
+}
+std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.h
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <map>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+namespace paddle {
+namespace framework {
+namespace details {
+struct FuseVarsOpHandle : public OpHandleBase {
+ public:
+  FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
+                   const std::unordered_map<std::string, int64_t> &inputs_numel,
+                   const std::type_index &var_type)
+      : local_scope_(local_scope),
+        place_(place),
+        inputs_numel_(inputs_numel),
+        type_(var_type) {
+    total_numel_ = 0;
+    for (auto in_numel : inputs_numel) {
+      PADDLE_ENFORCE_GT(in_numel.second, 0);
+      total_numel_ += in_numel.second;
+    }
+  }
+  std::string Name() const override;
+  bool IsMultiDeviceTransfer() override { return false; };
+ protected:
+  void RunImpl() override;
+ private:
+  Scope *local_scope_;
+  const platform::Place place_;
+  const std::unordered_map<std::string, int64_t> inputs_numel_;
+  const std::type_index type_;
+  int64_t total_numel_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -17,6 +17,7 @@
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
@@ -26,14 +27,6 @@
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/scope.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
-#endif
-DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot",
-              "the ssa graph path only print with GLOG_v=10,"
-              "default /tmp/graph.dot");
 namespace paddle {
 namespace framework {
 namespace details {
@@ -93,7 +86,7 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
  for (auto *op : program.Block(0).AllOps()) {
    // TODO(Yancey1989): use a graceful method to find send op,
    // instead of the the hard code string
-    if (op->Type() == "send_vars") {
+    if (op->Type() == "send") {
      auto op_vars = op->InputArgumentNames();
      send_vars.reserve(send_vars.size() +
                        std::distance(op_vars.begin(), op_vars.end()));
@@ -247,7 +240,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
                    CreateReduceOp(&result, g_name, 0);
                    CreateBroadcastOp(&result, g_name, 0);
                  } else {
-                    InsertNCCLAllReduceOp(&result, g_name);
+                    InsertAllReduceOp(&result, g_name);
                  }
                  break;
              }
@@ -277,11 +270,6 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   */
  AddOutputToLeafOps(&result);
-  if (VLOG_IS_ON(10)) {
-    std::ofstream fout(FLAGS_ssa_graph_path);
-    PrintGraphviz(*graph, fout);
-  }
  return std::unique_ptr<SSAGraph>(graph);
 }
@@ -295,6 +283,19 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(
  return false;
 }
+void MultiDevSSAGraphBuilder::SetCommunicationContext(
+    OpHandleBase *op_handle, const platform::Place &p) const {
+#ifdef PADDLE_WITH_CUDA
+  if (nccl_ctxs_ == nullptr) {
+    op_handle->SetDeviceContext(p,
+                                platform::DeviceContextPool::Instance().Get(p));
+  }
+#else
+  op_handle->SetDeviceContext(p,
+                              platform::DeviceContextPool::Instance().Get(p));
+#endif
+}
 void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
                                                const std::string &p_name,
                                                size_t src_dev_id) const {
@@ -309,15 +310,12 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
  op_handle->AddInput(in);
  for (size_t i = 0; i < places_.size(); ++i) {
-    auto &vars = result->vars_.at(i).at(p_name);
    auto &p = places_[i];
+    SetCommunicationContext(op_handle, p);
+    auto &vars = result->vars_.at(i).at(p_name);
    auto *out_var = new VarHandle(vars.size(), i, p_name, p);
    vars.emplace_back(out_var);
    op_handle->AddOutput(out_var);
-#ifndef ADDLE_WITH_CUDA
-    op_handle->SetDeviceContext(p,
-                                platform::DeviceContextPool::Instance().Get(p));
-#endif
  }
 }
@@ -329,15 +327,19 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result,
  CreateOpHandleIOs(result, op, dev_id);
 }
-void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
+void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result,
-    SSAGraph *result, const std::string &og) const {
+                                                const std::string &og) const {
 #ifdef PADDLE_WITH_CUDA
  result->ops_.emplace_back(
-      new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
+      new AllReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
+#else
+  result->ops_.emplace_back(new AllReduceOpHandle(local_scopes_, places_));
+#endif
  auto *op_handle = result->ops_.back().get();
  for (size_t i = 0; i < places_.size(); ++i) {
    auto &p = places_[i];
+    SetCommunicationContext(op_handle, p);
    auto &vars = result->vars_[i][og];
    PADDLE_ENFORCE(!vars.empty());
    auto &prev_grad = vars.back();
@@ -347,9 +349,6 @@ void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
    vars.emplace_back(var);
    op_handle->AddOutput(var);
  }
-#else
-  PADDLE_ENFORCE("Not implemented");
-#endif
 }
 bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
@@ -388,7 +387,9 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
  for (size_t i = 0; i < places_.size(); ++i) {
 // Insert ScaleCost OpHandle
 #ifdef PADDLE_WITH_CUDA
-    auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]);
+    auto *communication_dev_ctx =
+        nccl_ctxs_ ? nccl_ctxs_->DevCtx(places_[i])
+                   : platform::DeviceContextPool::Instance().Get(places_[i]);
 #else
    auto *communication_dev_ctx =
        platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
@@ -433,12 +434,9 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
  auto *op_handle = result->ops_.back().get();
  for (size_t i = 0; i < places_.size(); ++i) {
-    auto &vars = result->vars_[i][og];
-#ifndef PADDLE_WITH_CUDA
    auto &p = places_[i];
-    op_handle->SetDeviceContext(p,
+    SetCommunicationContext(op_handle, p);
-                                platform::DeviceContextPool::Instance().Get(p));
+    auto &vars = result->vars_[i][og];
-#endif
    PADDLE_ENFORCE(!vars.empty());
    auto &prev_grad = vars.back();
    op_handle->AddInput(prev_grad.get());
@@ -473,22 +471,21 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
 void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
                                          const OpDesc &op) const {
-  auto &p = places_[0];
+  result->ops_.emplace_back(
-  auto *s = local_scopes_[0];
+      new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0]));
-  result->ops_.emplace_back(new RPCOpHandle(op, s, p, op.Type()));
  if (op.Type() == "send_barrier") {
-    ConnectOp(result, result->ops_.back().get(), "send_vars");
+    ConnectOp(result, result->ops_.back().get(), "send");
  } else if (op.Type() == "recv") {
    ConnectOp(result, result->ops_.back().get(), "send_barrier");
  } else if (op.Type() == "fetch_barrier") {
    ConnectOp(result, result->ops_.back().get(), "recv");
-  } else if (op.Type() == "send_vars") {
+  } else if (op.Type() == "send") {
    // do nothing
  } else {
    PADDLE_THROW(
        "rpc op should be in ["
-        "send_vars, send_barrier. recv, fetch_barrier]");
+        "send, send_barrier. recv, fetch_barrier]");
  }
  // TODO(Yancey1989): schedule rpc op on different place may

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -100,7 +100,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
      const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
      const OpDesc &op) const;
-  void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
+  void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
  void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
                         size_t src_dev_id) const;
@@ -111,6 +111,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 private:
  BuildStrategy strategy_;
+  void SetCommunicationContext(OpHandleBase *op_handle,
+                               const platform::Place &p) const;
 };
 }  // namespace details
 }  // namespace framework

--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -39,9 +39,9 @@ OpHandleBase::~OpHandleBase() {
 #endif
 }
-void OpHandleBase::Run(bool use_event) {
+void OpHandleBase::Run(bool use_cuda) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_event) {
+  if (events_.empty() && use_cuda) {
    for (auto &p : dev_ctxes_) {
      int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
      PADDLE_ENFORCE(cudaSetDevice(dev_id));
@@ -50,7 +50,7 @@ void OpHandleBase::Run(bool use_event) {
    }
  }
 #else
-  PADDLE_ENFORCE(!use_event);
+  PADDLE_ENFORCE(!use_cuda);
 #endif
  RunImpl();
@@ -104,6 +104,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
  }
 }
+size_t OpHandleBase::NoDummyInputSize() const {
+  size_t cnt = 0;
+  for (auto *in : inputs_) {
+    if (dynamic_cast<DummyVarHandle *>(in) == nullptr) {
+      ++cnt;
+    }
+  }
+  return cnt;
+}
 bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
  return in_var && in_var->generated_op_;
 }

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -36,7 +36,7 @@ class OpHandleBase {
  virtual std::string Name() const = 0;
-  void Run(bool use_event);
+  void Run(bool use_cuda);
  virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx);
@@ -80,6 +80,8 @@ class OpHandleBase {
  const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
+  size_t NoDummyInputSize() const;
 protected:
  void RunAndRecordEvent(const std::function<void()> &callback);

--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -37,7 +37,9 @@ struct ReduceLoDTensor {
    PADDLE_ENFORCE_NE(t0.numel(), 0);
    dst_tensor_.Resize(t0.dims());
    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
-    std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
+    if (dst != t0.data<T>()) {
+      std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
+    }
    for (size_t i = 1; i < src_tensors_.size(); ++i) {
      auto &t = *src_tensors_[i];

--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -32,8 +32,8 @@ namespace framework {
 namespace details {
 struct ReduceOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
+  std::vector<Scope *> local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<platform::Place> places_;
 #ifdef PADDLE_WITH_CUDA
  const platform::NCCLContextMap *nccl_ctxs_;

--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -19,12 +19,12 @@ namespace framework {
 namespace details {
 RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc,
-                         const Scope *local_scope, const platform::Place &place,
+                         const Scope *local_scope, const std::string &name,
-                         const std::string &name)
+                         const platform::Place &place)
    : op_(framework::OpRegistry::CreateOp(op_desc)),
      local_scope_(local_scope),
-      place_(place),
+      name_(name),
-      name_(name) {}
+      place_(place) {}
 void RPCOpHandle::RunImpl() {
  // TODO(wuyi): need further analysis whether wait VarDummyHandle.

--- a/paddle/fluid/framework/details/rpc_op_handle.h
+++ b/paddle/fluid/framework/details/rpc_op_handle.h
@@ -29,7 +29,7 @@ namespace details {
 struct RPCOpHandle : public OpHandleBase {
  RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
-              const platform::Place& place, const std::string& name);
+              const std::string& name, const platform::Place& place);
  std::string Name() const override;
@@ -43,8 +43,8 @@ struct RPCOpHandle : public OpHandleBase {
 private:
  std::unique_ptr<OperatorBase> op_;
  const Scope* local_scope_;
-  const platform::Place& place_;
  const std::string name_;
+  platform::Place place_;
 };
 }  // namespace details

--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/executor.h"
+namespace paddle {
+namespace framework {
+namespace details {
+ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
+    ExecutionStrategy strategy, std::vector<Scope *> local_scopes,
+    std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
+    std::unique_ptr<SSAGraphExecutor> &&underlying_executor)
+    : strategy_(std::move(strategy)),
+      underlying_executor_(std::move(underlying_executor)),
+      local_scopes_(std::move(local_scopes)),
+      var_infos_(std::move(var_infos)),
+      places_(std::move(places)) {}
+FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
+    const std::vector<std::string> &fetch_tensors) {
+  if (drop_scope_counter_ == 0) {
+    // Create local scopes.
+    for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) {
+      auto &scope = *it;
+      Scope &local_scope = scope->NewScope();
+      *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+          &local_scope;
+      for (auto &info : var_infos_) {
+        if (scope->FindVar(info.name_) != nullptr) {
+          continue;
+        }
+        if (info.persistable_) {  // Persistable
+          InitializeVariable(scope->Var(info.name_), info.type_);
+        } else {
+          InitializeVariable(local_scope.Var(info.name_), info.type_);
+        }
+      }
+    }
+  }
+  auto fetch_data = underlying_executor_->Run(fetch_tensors);
+  drop_scope_counter_ += 1;
+  if (!fetch_tensors.empty() ||
+      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
+    drop_scope_counter_ = 0;
+    // Wait All computational streams
+    for (auto p : places_) {
+      platform::DeviceContextPool::Instance().Get(p)->Wait();
+    }
+    for (auto &scope : local_scopes_) {
+      auto &local_scope =
+          *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+      scope->DeleteScope(local_scope);
+    }
+  }
+  return fetch_data;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_executor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/place.h"
+namespace paddle {
+namespace framework {
+namespace details {
+struct VariableInfo {
+  std::string name_;
+  proto::VarType::Type type_;
+  bool persistable_;
+};
+class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
+ public:
+  ScopeBufferedSSAGraphExecutor(
+      ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
+      std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
+      std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
+  FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
+ private:
+  size_t drop_scope_counter_{0};
+  ExecutionStrategy strategy_;
+  std::unique_ptr<SSAGraphExecutor> underlying_executor_;
+  std::vector<Scope*> local_scopes_;
+  std::vector<VariableInfo> var_infos_;
+  std::vector<platform::Place> places_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -11,8 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include <utility>
 namespace paddle {
 namespace framework {
@@ -73,64 +73,6 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
  op_handle->AddOutput(var);
 }
-template <typename Callback>
-void IterAllVar(const SSAGraph &graph, Callback callback) {
-  for (auto &each : graph.vars_) {
-    for (auto &pair1 : each) {
-      for (auto &pair2 : pair1.second) {
-        callback(*pair2);
-      }
-    }
-  }
-  for (auto &var : graph.dep_vars_) {
-    callback(*var);
-  }
-}
-void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
-  size_t var_id = 0;
-  std::unordered_map<const VarHandleBase *, size_t> vars;
-  sout << "digraph G {\n";
-  IterAllVar(graph, [&](const VarHandleBase &var) {
-    auto *var_ptr = &var;
-    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
-    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
-    size_t cur_var_id = var_id++;
-    vars[var_ptr] = cur_var_id;
-    if (var_handle_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
-           << "\\n"
-           << var_handle_ptr->place_ << "\\n"
-           << var_handle_ptr->version_ << "\"]" << std::endl;
-    } else if (dummy_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
-    }
-  });
-  size_t op_id = 0;
-  for (auto &op : graph.ops_) {
-    std::string op_name = "op_" + std::to_string(op_id++);
-    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
-         << std::endl;
-    for (auto in : op->Inputs()) {
-      std::string var_name = "var_" + std::to_string(vars[in]);
-      sout << var_name << " -> " << op_name << std::endl;
-    }
-    for (auto out : op->Outputs()) {
-      std::string var_name = "var_" + std::to_string(vars[out]);
-      sout << op_name << " -> " << var_name << std::endl;
-    }
-  }
-  sout << "}\n";
-}
 void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
  for (auto &op : graph->ops_) {
    if (!op->Outputs().empty()) {

--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -55,8 +55,6 @@ class SSAGraphBuilder {
                             const platform::Place &place, size_t place_offset);
  static void AddOutputToLeafOps(SSAGraph *graph);
-  static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
 };
 }  // namespace details
 }  // namespace framework

--- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
+#include <fstream>
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/ssa_graph_checker.h"
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+namespace paddle {
+namespace framework {
+namespace details {
+std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
+  std::unique_ptr<SSAGraphBuilder> res(
+#ifdef PADDLE_WITH_CUDA
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, nccl_ctxs_, strategy_)
+#else
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, strategy_)
+#endif
+          );  // NOLINT
+  if (!strategy_.debug_graphviz_path_.empty()) {
+    std::unique_ptr<std::ostream> fout(
+        new std::ofstream(strategy_.debug_graphviz_path_));
+    PADDLE_ENFORCE(fout->good());
+    std::unique_ptr<GraphvizSSAGraphPrinter> graphviz_printer(
+        new GraphvizSSAGraphPrinter());
+    res.reset(new SSAGraghBuilderWithPrinter(
+        std::move(fout), std::move(graphviz_printer), std::move(res)));
+  }
+  res.reset(new SSAGraghBuilderWithChecker(std::move(res)));
+  return res;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_builder_factory.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/place.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+namespace paddle {
+namespace framework {
+class Scope;
+namespace details {
+class SSAGraphBuilderFactory {
+ public:
+  SSAGraphBuilderFactory(const std::vector<platform::Place>& places,
+                         const std::string& loss_var_name,
+                         const std::unordered_set<std::string>& param_names,
+                         const std::vector<Scope*>& local_scopes,
+                         const BuildStrategy& strategy)
+      : places_(places),
+        loss_var_name_(loss_var_name),
+        param_names_(param_names),
+        local_scopes_(local_scopes),
+        strategy_(strategy) {
+#ifdef PADDLE_WITH_CUDA
+    nccl_ctxs_ = nullptr;
+#endif
+  }
+#ifdef PADDLE_WITH_CUDA
+  void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) {
+    nccl_ctxs_ = nccl_ctxs;
+  }
+#endif
+  std::unique_ptr<SSAGraphBuilder> Create();
+ private:
+  std::vector<platform::Place> places_;
+  std::string loss_var_name_;
+  std::unordered_set<std::string> param_names_;
+  std::vector<Scope*> local_scopes_;
+  BuildStrategy strategy_;
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap* nccl_ctxs_;
+#endif
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_checker.cc
+++ b/paddle/fluid/framework/details/ssa_graph_checker.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/ssa_graph.h"
+#include <string>
+#include "paddle/fluid/framework/details/ssa_graph_checker.h"
+namespace paddle {
+namespace framework {
+namespace details {
+bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
+  std::unordered_map<OpHandleBase *, size_t> pending_ops;
+  std::unordered_set<VarHandleBase *> pending_vars;
+  std::unordered_set<VarHandleBase *> ready_vars;
+  std::unordered_set<OpHandleBase *> ready_ops;
+  auto insert_pending_var = [&](VarHandleBase *var) {
+    pending_vars.insert(var);
+    if (var->generated_op_ == nullptr) {
+      ready_vars.emplace(var);
+    }
+  };
+  for (auto &var_map : graph->vars_) {
+    for (auto &name_pair : var_map) {
+      for (auto &version_pair : name_pair.second) {
+        insert_pending_var(version_pair.get());
+      }
+    }
+  }
+  for (auto &var : graph->dep_vars_) {
+    insert_pending_var(var.get());
+  }
+  for (auto &op : graph->ops_) {
+    if (op->Inputs().empty()) {
+      ready_ops.insert(op.get());
+    } else {
+      pending_ops.insert({op.get(), op.get()->NoDupInputSize()});
+    }
+  }
+  auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
+    for (auto *op : set) {
+      for (auto out : op->Outputs()) {
+        ready_vars.emplace(out);
+      }
+    }
+    set.clear();
+  };
+  while (!pending_vars.empty()) {
+    run_all_ops(ready_ops);
+    if (ready_vars.empty()) {
+      return false;
+    }
+    for (auto ready_var : ready_vars) {
+      pending_vars.erase(ready_var);
+      for (auto *op : ready_var->pending_ops_) {
+        auto &deps = --pending_ops[op];
+        if (deps == 0) {
+          ready_ops.insert(op);
+        }
+      }
+    }
+    ready_vars.clear();
+  }
+  return true;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+namespace paddle {
+namespace framework {
+namespace details {
+struct SSAGraph;
+class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
+ public:
+  explicit SSAGraghBuilderWithChecker(
+      std::unique_ptr<SSAGraphBuilder>&& builder)
+      : builder_(std::move(builder)) {}
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
+    auto graph = builder_->Build(program);
+    PADDLE_ENFORCE(IsValidGraph(graph.get()));
+    return graph;
+  }
+  bool IsValidGraph(const SSAGraph* graph) const;
+ private:
+  std::unique_ptr<SSAGraphBuilder> builder_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -17,10 +17,6 @@
 namespace paddle {
 namespace framework {
 namespace details {
-SSAGraphExecutor::SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph)
-    : graph_(std::move(graph)) {}
 SSAGraphExecutor::~SSAGraphExecutor() {}
 }  // namespace details

--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -28,15 +28,11 @@ class SSAGraphExecutor {
  DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
 public:
-  // Steal graph inside
+  SSAGraphExecutor() {}
-  explicit SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph);
  virtual ~SSAGraphExecutor();
  virtual FeedFetchList Run(const std::vector<std::string> &fetch_tensors) = 0;
- protected:
-  std::unique_ptr<SSAGraph> graph_;
 };
 }  // namespace details
 }  // namespace framework

--- a/paddle/fluid/framework/details/ssa_graph_printer.cc
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+#include <string>
+#include "paddle/fluid/framework/details/ssa_graph.h"
+namespace paddle {
+namespace framework {
+namespace details {
+template <typename Callback>
+static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
+  for (auto &each : graph.vars_) {
+    for (auto &pair1 : each) {
+      for (auto &pair2 : pair1.second) {
+        callback(*pair2);
+      }
+    }
+  }
+  for (auto &var : graph.dep_vars_) {
+    callback(*var);
+  }
+}
+void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
+                                    std::ostream &sout) const {
+  size_t var_id = 0;
+  std::unordered_map<const VarHandleBase *, size_t> vars;
+  sout << "digraph G {\n";
+  IterAllVar(graph, [&](const VarHandleBase &var) {
+    auto *var_ptr = &var;
+    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
+    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
+    size_t cur_var_id = var_id++;
+    vars[var_ptr] = cur_var_id;
+    if (var_handle_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
+           << "\\n"
+           << var_handle_ptr->place_ << "\\n"
+           << var_handle_ptr->version_ << "\"]" << std::endl;
+    } else if (dummy_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
+    }
+  });
+  size_t op_id = 0;
+  for (auto &op : graph.ops_) {
+    std::string op_name = "op_" + std::to_string(op_id++);
+    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
+         << std::endl;
+    for (auto in : op->Inputs()) {
+      std::string var_name = "var_" + std::to_string(vars[in]);
+      sout << var_name << " -> " << op_name << std::endl;
+    }
+    for (auto out : op->Outputs()) {
+      std::string var_name = "var_" + std::to_string(vars[out]);
+      sout << op_name << " -> " << var_name << std::endl;
+    }
+  }
+  sout << "}\n";
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <iosfwd>
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+namespace paddle {
+namespace framework {
+namespace details {
+struct SSAGraph;
+class SSAGraphPrinter {
+ public:
+  virtual ~SSAGraphPrinter() {}
+  virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0;
+};
+class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
+ public:
+  void Print(const SSAGraph& graph, std::ostream& sout) const override;
+};
+class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
+ public:
+  SSAGraghBuilderWithPrinter(std::ostream& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ref_(sout) {}
+  SSAGraghBuilderWithPrinter(std::unique_ptr<std::ostream>&& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ptr_(std::move(sout)),
+        stream_ref_(*stream_ptr_) {}
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
+    auto graph = builder_->Build(program);
+    printer_->Print(*graph, stream_ref_);
+    return graph;
+  }
+ private:
+  std::unique_ptr<SSAGraphPrinter> printer_;
+  std::unique_ptr<SSAGraphBuilder> builder_;
+  std::unique_ptr<std::ostream> stream_ptr_;
+  std::ostream& stream_ref_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -21,7 +21,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places,
    std::unique_ptr<SSAGraph> &&graph)
-    : SSAGraphExecutor(std::move(graph)),
+    : graph_(std::move(graph)),
      pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                       : nullptr),
      local_scopes_(local_scopes),
@@ -185,12 +185,15 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
    ready_vars->Push(var);
  }
 }
 void ThreadedSSAGraphExecutor::RunOp(
    BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
  auto op_run = [ready_var_q, op, this] {
    try {
-      VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+      if (VLOG_IS_ON(10)) {
-      op->Run(strategy_.use_event_);
+        VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+      }
+      op->Run(strategy_.use_cuda_);
      VLOG(10) << op << " " << op->Name() << " Done ";
      running_ops_--;
      ready_var_q->Extend(op->Outputs());

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -51,6 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
             details::OpHandleBase *op);
 private:
+  std::unique_ptr<SSAGraph> graph_;
  std::unique_ptr<::ThreadPool> pool_;
  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -20,10 +20,14 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#endif
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 DECLARE_bool(benchmark);
+DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
 namespace paddle {
 namespace framework {
@@ -43,6 +47,14 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
 Executor::Executor(const platform::Place& place) : place_(place) {}
+#ifdef PADDLE_WITH_DISTRIBUTE
+void Executor::Complete() {
+  ::paddle::operators::detail::RPCClient::GetInstance<
+      ::paddle::operators::detail::GRPCClient>()
+      ->SendComplete();
+}
+#endif
 void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
  if (var_type == proto::VarType::LOD_TENSOR) {
    var->GetMutable<LoDTensor>();
@@ -115,6 +127,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                   bool create_local_scope, bool create_vars) {
  platform::RecordBlock b(block_id);
+  if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
  auto ctx = Prepare(pdesc, block_id);
  RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
 }
@@ -214,16 +227,18 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
                   const std::string& feed_holder_name,
                   const std::string& fetch_holder_name) {
  platform::RecordBlock b(kProgramId);
+  if (FLAGS_use_mkldnn) EnableMKLDNN(program);
  bool has_feed_ops =
      has_feed_operators(program.Block(0), *feed_targets, feed_holder_name);
  bool has_fetch_ops =
      has_fetch_operators(program.Block(0), *fetch_targets, fetch_holder_name);
  ProgramDesc* copy_program = const_cast<ProgramDesc*>(&program);
+  std::unique_ptr<ProgramDesc> unique_ptr_of_copy_program;
  if (!has_feed_ops || !has_fetch_ops) {
-    copy_program = std::unique_ptr<ProgramDesc>(new ProgramDesc(program)).get();
+    unique_ptr_of_copy_program.reset(new ProgramDesc(program));
+    copy_program = unique_ptr_of_copy_program.get();
  }
  auto* global_block = copy_program->MutableBlock(0);
  if (!has_feed_ops) {
@@ -376,5 +391,19 @@ void Executor::RunPreparedContext(
  }
 }
+void Executor::EnableMKLDNN(const ProgramDesc& program) {
+#ifdef PADDLE_WITH_MKLDNN
+  VLOG(3) << "use_mkldnn=True";
+  for (size_t bid = 0; bid < program.Size(); ++bid) {
+    auto* block = const_cast<ProgramDesc&>(program).MutableBlock(bid);
+    for (auto* op : block->AllOps()) {
+      if (op->HasAttr("use_mkldnn")) {
+        op->SetAttr("use_mkldnn", true);
+      }
+    }
+  }
+#endif
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -44,6 +44,13 @@ class Executor {
  explicit Executor(const platform::Place& place);
+#ifdef PADDLE_WITH_DISTRIBUTE
+  /*
+   * Sending signal to pserver to mark current trainer stop.
+   */
+  void Complete();
+#endif
  /* @Brief
   * Runtime evaluation of the given ProgramDesc under certain Scope
   *
@@ -81,6 +88,8 @@ class Executor {
                          const std::string& feed_holder_name = "feed",
                          const std::string& fetch_holder_name = "fetch");
+  void EnableMKLDNN(const ProgramDesc& program);
 private:
  const platform::Place place_;
 };

--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -71,6 +71,7 @@ message OpProto {
    optional bool duplicable = 3 [ default = false ];
    optional bool intermediate = 4 [ default = false ];
    optional bool dispensable = 5 [ default = false ];
+    optional string reuse = 6;
  }
  // AttrProto describes the C++ type Attribute.

--- a/paddle/fluid/framework/op_info.cc
+++ b/paddle/fluid/framework/op_info.cc
@@ -17,12 +17,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-static OpInfoMap* g_op_info_map = nullptr;
+// C++11 removes the need for manual locking. Concurrent execution shall wait if
+// a static local variable is already being initialized.
+// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
 OpInfoMap& OpInfoMap::Instance() {
-  if (g_op_info_map == nullptr) {
+  static OpInfoMap* g_op_info_map = new OpInfoMap();
-    g_op_info_map = new OpInfoMap();
-  }
  return *g_op_info_map;
 }
 }  // namespace framework

--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -87,7 +87,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
 }
 inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
-  return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r;
+  bool ret =
+      (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r);
+#ifdef PADDLE_WITH_MKLDNN
+  // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa
+  ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN);
+  ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN);
+#endif
+  return ret;
 }
 inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {

--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -21,6 +21,7 @@ namespace framework {
 void OpProtoAndCheckerMaker::Validate() {
  validated_ = true;
  CheckNoDuplicatedInOutAttrs();
+  CheckReuseVars();
 }
 OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput(
@@ -56,6 +57,24 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
  }
 }
+void OpProtoAndCheckerMaker::CheckReuseVars() {
+  std::unordered_set<std::string> names;
+  for (auto& input : proto_->inputs()) {
+    names.insert(input.name());
+  }
+  auto checker = [&](const std::string& name, const std::string& reused) {
+    PADDLE_ENFORCE(
+        names.count(reused),
+        "Output [%s] reuse Input [%s], but the input is not registered.", name,
+        reused);
+  };
+  for (auto& output : proto_->outputs()) {
+    if (output.has_reuse()) {
+      checker(output.name(), output.reuse());
+    }
+  }
+}
 void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
                                        OpAttrChecker* attr_checker) {
  proto_ = proto;

--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -14,6 +14,8 @@ limitations under the License. */
 #pragma once
 #include <string>
+#include <unordered_set>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -64,6 +66,11 @@ class OpProtoAndCheckerMaker {
      var_->set_dispensable(true);
      return *this;
    }
+    VariableBuilder &Reuse(const std::string &name) {
+      var_->set_reuse(name);
+      return *this;
+    }
  };
  VariableBuilder AddInput(const std::string &name, const std::string &comment);
@@ -89,6 +96,8 @@ class OpProtoAndCheckerMaker {
  void CheckNoDuplicatedInOutAttrs();
  void Validate();
+  void CheckReuseVars();
  proto::OpProto *proto_;
  OpAttrChecker *op_checker_;
  bool validated_{false};

--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -47,3 +47,23 @@ TEST(ProtoMaker, DuplicatedInOut) {
  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
               paddle::platform::EnforceNotMet);
 }
+class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "input of test op");
+    AddOutput("XOut", "output of test op").Reuse("X");
+    AddOutput("NoOut", "output of test op").Reuse("NotExists");
+  }
+};
+TEST(ProtoMaker, InplaceOutput) {
+  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::OpAttrChecker op_checker;
+  TestInplaceProtoMaker proto_maker;
+  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+               paddle::platform::EnforceNotMet);
+  // proto_maker(&op_proto, &op_checker);
+  // proto_maker.Make();
+  // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+}
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -83,8 +83,14 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
  void operator()(const char* op_type, const char* library_type) const {
    using T = typename KERNEL_TYPE::ELEMENT_TYPE;
+    std::string library(library_type);
+    std::string data_layout = "ANYLAYOUT";
+    if (library == "MKLDNN") {
+      data_layout = "MKLDNNLAYOUT";
+    }
    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
-                     DataLayout::kAnyLayout, StringToLibraryType(library_type));
+                     StringToDataLayout(data_layout),
+                     StringToLibraryType(library_type));
    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
    constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
@@ -99,7 +105,8 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
  void operator()(const char* op_type, const char* library_type) const {}
 };
-// User can register many kernel in one place. The data type could be different.
+// User can register many kernel in one place. The data type could be
+// different.
 template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
 public:
@@ -149,15 +156,15 @@ class OpKernelRegistrar : public Registrar {
 /**
 * Macro to register OperatorKernel.
 */
-#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...)        \
+#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)        \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
-      __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__,                      \
+      __reg_op_kernel_##op_type##_##library_type##__,                      \
      "REGISTER_OP_KERNEL must be called in global namespace");            \
  static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__>  \
-      __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type,       \
+      __op_kernel_registrar_##op_type##_##library_type##__(#op_type,       \
-                                                           #LIBRARY_TYPE); \
+                                                           #library_type); \
-  int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() {                \
+  int TouchOpKernelRegistrar_##op_type##_##library_type() {                \
-    __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch();          \
+    __op_kernel_registrar_##op_type##_##library_type##__.Touch();          \
    return 0;                                                              \
  }

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -293,6 +293,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) {
  }
 }
+bool ExecutionContext::HasInput(const std::string& name) const {
+  if (!op_.HasInputs(name)) {
+    return false;
+  }
+  auto& ins = Inputs(name);
+  size_t length = ins.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Input %s should not have more than one inputs", name);
+  auto arg = ins[0];
+  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
+  return var != nullptr;
+}
+bool ExecutionContext::HasOutput(const std::string& name) const {
+  if (!op_.HasOutputs(name)) {
+    return false;
+  }
+  auto& outs = Outputs(name);
+  size_t length = outs.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Output %s should not have more than one inputs", name);
+  auto arg = outs[0];
+  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
+  return var != nullptr;
+}
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
  auto* var = InputVar(name);
@@ -444,10 +476,25 @@ class RuntimeInferShapeContext : public InferShapeContext {
    auto* out_tensor = out_var->GetMutable<LoDTensor>();
    out_tensor->set_lod(in_tensor.lod());
-    // TODO(dzhwinter) : reuse ShareLoD in most operators.
+// TODO(dzhwinter) : reuse ShareLoD in most operators.
-    // Need to call ShareLayout explicitly in sequence related ops.
+// Need to call ShareLayout explicitly in sequence related ops.
-    // Shall we have a better method to shared info between in/out Tensor?
+// Shall we have a better method to shared info between in/out Tensor?
-    out_tensor->set_layout(in_tensor.layout());
+#ifdef PADDLE_WITH_MKLDNN
+    // Fix me: ugly workaround below
+    // Correct solution:
+    //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
+    //    layout of output tensor should be set "manually" in Compute()
+    //    of each OPKernel. The reason layout should NOT be shared between
+    //    input and output "automatically" (now by InferShape()->ShareLoD())
+    //    is that layout transform may occur after InferShape().
+    // Workaround:
+    //    Skip set_layout() when input layout is kMKLDNN
+    //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
+    //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
+    //    in Compute()
+    if (in_tensor.layout() != DataLayout::kMKLDNN)
+#endif
+      out_tensor->set_layout(in_tensor.layout());
  }
  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
@@ -646,8 +693,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
        }
        if (t != nullptr) {
          int tmp = static_cast<int>(ToDataType(t->type()));
-          PADDLE_ENFORCE(tmp == data_type || data_type == -1,
+          PADDLE_ENFORCE(
-                         "DataType of Paddle Op %s must be the same.", Type());
+              tmp == data_type || data_type == -1,
+              "DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
+              data_type, tmp);
          data_type = tmp;
        }
      }
@@ -665,7 +714,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType(
 OpKernelType OperatorWithKernel::GetKernelTypeForVar(
    const std::string& var_name, const Tensor& tensor,
    const OpKernelType& expected_kernel_type) const {
-  return OpKernelType(expected_kernel_type.data_type_, tensor.place());
+  return OpKernelType(expected_kernel_type.data_type_, tensor.place(),
+                      tensor.layout());
 }
 }  // namespace framework

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -191,9 +191,9 @@ class ExecutionContext {
    return op_.Attr<T>(name);
  }
-  bool HasInput(const std::string& name) const { return op_.HasInputs(name); }
+  bool HasInput(const std::string& name) const;
-  bool HasOutput(const std::string& name) const { return op_.HasOutputs(name); }
+  bool HasOutput(const std::string& name) const;
  size_t InputSize(const std::string& name) const {
    return op_.Inputs(name).size();

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
--- a/paddle/fluid/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
--- a/paddle/fluid/operators/arg_max_op.cc
+++ b/paddle/fluid/operators/arg_max_op.cc
--- a/paddle/fluid/operators/arg_max_op.cu
+++ b/paddle/fluid/operators/arg_max_op.cu
--- a/paddle/fluid/operators/arg_min_max_op_base.h
+++ b/paddle/fluid/operators/arg_min_max_op_base.h
--- a/paddle/fluid/operators/arg_min_op.cc
+++ b/paddle/fluid/operators/arg_min_op.cc
--- a/paddle/fluid/operators/arg_min_op.cu
+++ b/paddle/fluid/operators/arg_min_op.cu
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
--- a/paddle/fluid/operators/batch_size_like.h
+++ b/paddle/fluid/operators/batch_size_like.h
--- a/paddle/fluid/operators/bilinear_interp_op.cc
+++ b/paddle/fluid/operators/bilinear_interp_op.cc
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
--- a/paddle/fluid/operators/detail/brpc_client.cc
+++ b/paddle/fluid/operators/detail/brpc_client.cc
--- a/paddle/fluid/operators/detail/brpc_client.h
+++ b/paddle/fluid/operators/detail/brpc_client.h
--- a/paddle/fluid/operators/detail/brpc_server.cc
+++ b/paddle/fluid/operators/detail/brpc_server.cc
--- a/paddle/fluid/operators/detail/brpc_server.h
+++ b/paddle/fluid/operators/detail/brpc_server.h
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
--- a/paddle/fluid/operators/detail/serde_test.cc
+++ b/paddle/fluid/operators/detail/serde_test.cc
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
--- a/paddle/fluid/operators/detail/macros.h
+++ b/paddle/fluid/operators/detail/macros.h
--- a/paddle/fluid/operators/detail/request_handler.h
+++ b/paddle/fluid/operators/detail/request_handler.h
--- a/paddle/fluid/operators/detail/request_handler_impl.cc
+++ b/paddle/fluid/operators/detail/request_handler_impl.cc
--- a/paddle/fluid/operators/detail/request_handler_impl.h
+++ b/paddle/fluid/operators/detail/request_handler_impl.h
--- a/paddle/fluid/operators/detail/rpc_client.cc
+++ b/paddle/fluid/operators/detail/rpc_client.cc
--- a/paddle/fluid/operators/detail/rpc_client.h
+++ b/paddle/fluid/operators/detail/rpc_client.h
--- a/paddle/fluid/operators/detail/rpc_server.cc
+++ b/paddle/fluid/operators/detail/rpc_server.cc
--- a/paddle/fluid/operators/detail/rpc_server.h
+++ b/paddle/fluid/operators/detail/rpc_server.h
--- a/paddle/fluid/operators/detail/grpc_server_test.cc
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.h
--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
--- a/paddle/fluid/operators/detection/box_coder_op.cu
+++ b/paddle/fluid/operators/detection/box_coder_op.cu
--- a/paddle/fluid/operators/detection/box_coder_op.h
+++ b/paddle/fluid/operators/detection/box_coder_op.h
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
--- a/paddle/fluid/operators/max_sequence_len_op.cc
+++ b/paddle/fluid/operators/max_sequence_len_op.cc
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
--- a/paddle/fluid/operators/norm_op.cu
+++ b/paddle/fluid/operators/norm_op.cu
--- a/paddle/fluid/operators/norm_op.h
+++ b/paddle/fluid/operators/norm_op.h
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
--- a/paddle/fluid/operators/random_crop_op.cc
+++ b/paddle/fluid/operators/random_crop_op.cc
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
--- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
--- a/paddle/fluid/operators/reader/open_files_op.cc
+++ b/paddle/fluid/operators/reader/open_files_op.cc
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
--- a/paddle/fluid/operators/reduce_max_op.cc
+++ b/paddle/fluid/operators/reduce_max_op.cc
--- a/paddle/fluid/operators/reduce_max_op.cu
+++ b/paddle/fluid/operators/reduce_max_op.cu
--- a/paddle/fluid/operators/reduce_mean_op.cc
+++ b/paddle/fluid/operators/reduce_mean_op.cc
--- a/paddle/fluid/operators/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_mean_op.cu
--- a/paddle/fluid/operators/reduce_mean_op.h
+++ b/paddle/fluid/operators/reduce_mean_op.h
--- a/paddle/fluid/operators/reduce_min_max_op.h
+++ b/paddle/fluid/operators/reduce_min_max_op.h
--- a/paddle/fluid/operators/reduce_min_op.cc
+++ b/paddle/fluid/operators/reduce_min_op.cc
--- a/paddle/fluid/operators/reduce_min_op.cu
+++ b/paddle/fluid/operators/reduce_min_op.cu
--- a/paddle/fluid/operators/reduce_op.cc
+++ b/paddle/fluid/operators/reduce_op.cc
--- a/paddle/fluid/operators/reduce_op.cu
+++ b/paddle/fluid/operators/reduce_op.cu
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
--- a/paddle/fluid/operators/reduce_op_function.h
+++ b/paddle/fluid/operators/reduce_op_function.h
--- a/paddle/fluid/operators/reduce_prod_op.cc
+++ b/paddle/fluid/operators/reduce_prod_op.cc
--- a/paddle/fluid/operators/reduce_prod_op.cu
+++ b/paddle/fluid/operators/reduce_prod_op.cu
--- a/paddle/fluid/operators/reduce_prod_op.h
+++ b/paddle/fluid/operators/reduce_prod_op.h
--- a/paddle/fluid/operators/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_sum_op.cc
--- a/paddle/fluid/operators/reduce_sum_op.cu
+++ b/paddle/fluid/operators/reduce_sum_op.cu
--- a/paddle/fluid/operators/reduce_sum_op.h
+++ b/paddle/fluid/operators/reduce_sum_op.h
--- a/paddle/fluid/operators/reverse_op.cc
+++ b/paddle/fluid/operators/reverse_op.cc
--- a/paddle/fluid/operators/reverse_op.cu
+++ b/paddle/fluid/operators/reverse_op.cu
--- a/paddle/fluid/operators/reverse_op.h
+++ b/paddle/fluid/operators/reverse_op.h
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
--- a/paddle/fluid/operators/send_vars_op.cc
+++ b/paddle/fluid/operators/send_vars_op.cc
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
--- a/paddle/fluid/operators/slice_op.cu
+++ b/paddle/fluid/operators/slice_op.cu
--- a/paddle/fluid/operators/slice_op.h
+++ b/paddle/fluid/operators/slice_op.h
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
--- a/paddle/fluid/operators/test_send_nccl_id.cc
+++ b/paddle/fluid/operators/test_send_nccl_id.cc
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/recordio/chunk.cc
+++ b/paddle/fluid/recordio/chunk.cc
--- a/paddle/fluid/recordio/chunk.h
+++ b/paddle/fluid/recordio/chunk.h
--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
--- a/paddle/fluid/recordio/scanner.h
+++ b/paddle/fluid/recordio/scanner.h
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
--- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
--- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
--- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
--- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
--- a/python/paddle/fluid/tests/test_concurrency.py
+++ b/python/paddle/fluid/tests/test_concurrency.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/benchmark.py
+++ b/python/paddle/fluid/tests/unittests/benchmark.py
--- a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
--- a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
--- a/python/paddle/fluid/tests/unittests/test_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py
--- a/python/paddle/fluid/tests/unittests/test_crop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crop_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
--- a/python/paddle/fluid/tests/unittests/test_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
+++ b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
--- a/python/paddle/fluid/tests/unittests/test_reverse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py
--- a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
--- a/python/paddle/fluid/tests/unittests/test_split_var.py
+++ b/python/paddle/fluid/tests/unittests/test_split_var.py
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
--- a/python/paddle/fluid/tests/unittests/transpiler_test.py
+++ b/python/paddle/fluid/tests/unittests/transpiler_test.py
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
--- a/python/paddle/fluid/transpiler/__init__.py
+++ b/python/paddle/fluid/transpiler/__init__.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler_simple.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler_simple.py
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
--- a/tools/codestyle/.gitignore
+++ b/tools/codestyle/.gitignore
--- a/tools/codestyle/docstring_checker.py
+++ b/tools/codestyle/docstring_checker.py
--- a/tools/codestyle/pylint_pre_commit.hook
+++ b/tools/codestyle/pylint_pre_commit.hook