merge develop branch

ebfe5a02 · dzhwinter · c8adc2c6 · 3c957af1 · ebfe5a02 · ebfe5a02
1000 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -25,5 +25,7 @@ third_party/
 bazel-*
 third_party/

+build_*
 # clion workspace.
 cmake-build-*
+model_test
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,7 +12,6 @@ services:
 os:
  - linux
 env:
-  - JOB=doc
  - JOB=check_style
  - JOB=build_android
 addons:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -70,8 +70,11 @@ option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
 option(WITH_INFERENCE    "Compile fluid inference library"              ON)
+option(ON_INFER         "Turn on inference optimization."               OFF)
+option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
+option(WITH_FAST_MATH   "Make use of fast math library, might affect the precision to some extent" ON)

 # PY_VERSION
 if(NOT PY_VERSION)
@@ -126,6 +129,9 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
 set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
  "A path setting fluid shared and static libraries")

+set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
+  "A path setting fluid inference shared and static libraries")
+
 if (WITH_C_API AND WITH_PYTHON)
  message(WARNING "It is suggest not embedded a python interpreter in Paddle "
    "when using C-API. It will give an unpredictable behavior when using a "
@@ -175,6 +181,7 @@ include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
+include(external/xxhash)    # download xxhash

 if (NOT WIN32)
 # there is no official support of snappystream, warpctc, nccl, cupti in windows
@@ -213,9 +220,11 @@ include(configure)          # add paddle env configuration
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
+endif()
+if(WITH_MKL OR WITH_MKLML)
    include(external/anakin)
 elseif()
-    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
+    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()

 include(flags)              # set paddle compile flags
@@ -297,3 +306,11 @@ if(WITH_DOC)
    find_python_module(recommonmark REQUIRED)
    add_subdirectory(doc)
 endif()
+
+if (ON_INFER)
+    message(STATUS "On inference mode, will take place some specific optimization.")
+    add_definitions(-DPADDLE_ON_INFERENCE)
+else()
+    #TODO(luotao), combine this warning with `make inference_lib_dist` command.
+    message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
+endif()
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,6 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/

 RUN apt-get update && \
    apt-get install -y --allow-downgrades patchelf \
+    python3 python3-dev python3-pip \
    git python-pip python-dev python-opencv openssh-server bison \
    libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@@ -53,7 +54,7 @@ RUN curl -s -q https://glide.sh/get | sh
 #    and its size is only one-third of the official one.
 # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
 #    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
    tar -xz -C /usr/local && \
    cp -rf /usr/local/TensorRT/include /usr && \
    cp -rf /usr/local/TensorRT/lib /usr
@@ -70,24 +71,33 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN easy_install -U pip && \
-    pip install -U wheel && \
+RUN pip3 install -U wheel && \
+    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    easy_install -U pip && \
+    pip install -U pip setuptools wheel && \
    pip install -U docopt PyYAML sphinx==1.5.6 && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark

-RUN pip install pre-commit 'ipython==5.3.0' && \
+RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3 install opencv-python && \
+    pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
    pip install opencv-python

 #For docstring checker
+RUN pip3 install pylint pytest astroid isort
 RUN pip install pylint pytest astroid isort LinkChecker

 COPY ./python/requirements.txt /root/
+RUN pip3 install -r /root/requirements.txt
 RUN pip install -r /root/requirements.txt

 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
 RUN apt-get install -y libssl-dev libffi-dev
+RUN pip3 install certifi urllib3[secure]
 RUN pip install certifi urllib3[secure]



--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@


 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.0/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)

@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.


-### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
+### Latest PaddlePaddle Release: [Fluid 1.0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -27,9 +27,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==0.14.0.post87
+pip install paddlepaddle-gpu==1.0.1.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==0.14.0.post85
+pip install paddlepaddle-gpu==1.0.1.post85

 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -76,33 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85

 ## Installation

-It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html)
-before looking into the
-[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html).
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) on our website.

 ## Documentation

-We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
-[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.0.0/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) documentation.

- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)
+- [Deep Learning 101](https://github.com/PaddlePaddle/book)

  You might want to start from this online interactive book that can run in a Jupyter Notebook.

- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.0/user_guides/howto/training/cluster_howto.html)

  You can run distributed training jobs on MPI clusters.

- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html)
-
-   You can also run distributed training jobs on Kubernetes clusters.
-
- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/1.0/fluid.html)

   Our new API enables much shorter programs.

- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.0/advanced_usage/development/contribute_to_paddle.html)

   We appreciate your contributions!


--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
@@ -11,6 +11,7 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
 # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
 # exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...

+
 RUN pip install -U pip
 RUN pip install -U kubernetes paddlepaddle

@@ -27,5 +28,6 @@ ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl 

 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
+ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
 ADD models/ /workspace/models/
+
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@@ -17,7 +17,8 @@ import argparse
 __all__ = ['parse_args', ]

 BENCHMARK_MODELS = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "se_resnext", "vgg", "mnist",
+    "stacked_dynamic_lstm", "resnet_with_preprocess"
 ]


@@ -67,12 +68,12 @@ def parse_args():
        '--cpus',
        type=int,
        default=1,
-        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+        help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
    parser.add_argument(
        '--data_set',
        type=str,
        default='flowers',
-        choices=['cifar10', 'flowers'],
+        choices=['cifar10', 'flowers', 'imagenet'],
        help='Optional dataset for benchmark.')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
@@ -122,6 +123,11 @@ def parse_args():
        type=str,
        default="",
        help='Directory that contains all the training recordio files.')
+    parser.add_argument(
+        '--test_data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the test data (NOT recordio).')
    parser.add_argument(
        '--use_inference_transpiler',
        action='store_true',
@@ -130,5 +136,16 @@ def parse_args():
        '--no_random',
        action='store_true',
        help='If set, keep the random seed and do not shuffle the data.')
+    parser.add_argument(
+        '--reduce_strategy',
+        type=str,
+        choices=['reduce', 'all_reduce'],
+        default='all_reduce',
+        help='Specify the reduce strategy, can be reduce, all_reduce')
+    parser.add_argument(
+        '--fuse_broadcast_op',
+        action='store_true',
+        help='If set, would fuse multiple broadcast operators into one fused_broadcast operator.'
+    )
    args = parser.parse_args()
    return args
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -16,6 +16,7 @@ import argparse
 import cProfile
 import time
 import os
+import traceback

 import numpy as np

@@ -27,7 +28,7 @@ import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
 from args import *


-def append_nccl2_prepare(trainer_id):
+def append_nccl2_prepare(trainer_id, startup_prog):
    if trainer_id >= 0:
        # append gen_nccl_id at the end of startup program
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
@@ -40,11 +41,11 @@ def append_nccl2_prepare(trainer_id):
        current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
        worker_endpoints.remove(current_endpoint)

-        nccl_id_var = fluid.default_startup_program().global_block().create_var(
+        nccl_id_var = startup_prog.global_block().create_var(
            name="NCCLID",
            persistable=True,
            type=fluid.core.VarDesc.VarType.RAW)
-        fluid.default_startup_program().global_block().append_op(
+        startup_prog.global_block().append_op(
            type="gen_nccl_id",
            inputs={},
            outputs={"NCCLID": nccl_id_var},
@@ -59,7 +60,7 @@ def append_nccl2_prepare(trainer_id):
                        "nccl-based dist train.")


-def dist_transpile(trainer_id, args):
+def dist_transpile(trainer_id, args, train_prog, startup_prog):
    if trainer_id < 0:
        return None, None

@@ -80,133 +81,70 @@ def dist_transpile(trainer_id, args):
    # the role, should be either PSERVER or TRAINER
    training_role = os.getenv("PADDLE_TRAINING_ROLE")

-    t = distribute_transpiler.DistributeTranspiler()
+    config = distribute_transpiler.DistributeTranspilerConfig()
+    config.slice_var_up = not args.no_split_var
+    t = distribute_transpiler.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
+        # NOTE: *MUST* use train_prog, for we are using with guard to
+        # generate different program for train and test.
+        program=train_prog,
        pservers=pserver_endpoints,
        trainers=trainers,
-        sync_mode=not args.async_mode)
+        sync_mode=not args.async_mode,
+        startup_program=startup_prog)
    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(current_endpoint)
-        pserver_startup_program = t.get_startup_program(current_endpoint,
-                                                        pserver_program)
+        pserver_startup_program = t.get_startup_program(
+            current_endpoint, pserver_program, startup_program=startup_prog)
        return pserver_program, pserver_startup_program
    elif training_role == "TRAINER":
        train_program = t.get_trainer_program()
-        return train_program, fluid.default_startup_program()
+        return train_program, startup_prog
    else:
        raise ValueError(
            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
        )


-def test(exe, inference_program, test_reader, feeder, batch_acc):
-    accuracy_evaluator = fluid.metrics.Accuracy()
-    for batch_id, data in enumerate(test_reader()):
-        acc = exe.run(inference_program,
-                      feed=feeder.feed(data),
-                      fetch_list=[batch_acc])
-        accuracy_evaluator.update(value=np.array(acc), weight=len(data))
+def test_parallel(exe, test_args, args, test_prog, feeder):
+    acc_evaluators = []
+    for i in xrange(len(test_args[2])):
+        acc_evaluators.append(fluid.metrics.Accuracy())

-    return accuracy_evaluator.eval()
-
-
-# TODO(wuyi): replace train, train_parallel, test functions with new trainer
-# API once it is ready.
-def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
-          args, train_prog, startup_prog):
-    if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
-        place = core.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-        exe.run(train_prog)
-        return
-
-    if args.use_fake_data:
-        raise Exception(
-            "fake data is not supported in single GPU test for now.")
-
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(startup_prog)
-
-    # Use inference_transpiler to speedup
-    if not args.use_reader_op:
-        feed_var_list = [
-            var for var in train_prog.global_block().vars.itervalues()
-            if var.is_data
-        ]
-        feeder = fluid.DataFeeder(feed_var_list, place)
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        train_losses = []
-        if not args.use_reader_op:
-            reader_generator = train_reader()
-        batch_id = 0
-        data = None
+    to_fetch = [v.name for v in test_args[2]]
+    if args.use_reader_op:
+        test_args[4].start()
        while True:
-            if not args.use_reader_op:
-                data = next(reader_generator, None)
-                if data == None:
-                    break
-            if iters == args.iterations:
-                reader_generator.close()
+            try:
+                acc_rets = exe.run(fetch_list=to_fetch)
+                for i, e in enumerate(acc_evaluators):
+                    e.update(
+                        value=np.array(acc_rets[i]), weight=args.batch_size)
+            except fluid.core.EOFException as eof:
+                test_args[4].reset()
                break
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
+    else:
+        for batch_id, data in enumerate(test_args[3]()):
+            acc_rets = exe.run(feed=feeder.feed(data), fetch_list=to_fetch)
+            for i, e in enumerate(acc_evaluators):
+                e.update(value=np.array(acc_rets[i]), weight=len(data))

-            if args.use_reader_op:
-                try:
-                    loss = exe.run(train_prog, fetch_list=[avg_loss])
-                except fluid.core.EnforceNotMet as ex:
-                    break
-            else:
-                loss = exe.run(train_prog,
-                               feed=feeder.feed(data),
-                               fetch_list=[avg_loss])
-            iters += 1
-            batch_id += 1
-            # FIXME(wuyi): For use_reader_op, if the current
-            # pass is not the last, the last batch of this pass
-            # is also equal to args.batch_size.
-            if args.use_reader_op:
-                num_samples += args.batch_size * args.gpus
-            else:
-                num_samples += len(data)
-            train_losses.append(loss)
-            print("Pass: %d, Iter: %d, Loss: %f\n" %
-                  (pass_id, iters, np.mean(train_losses)))
-        print_train_time(start_time, time.time(), num_samples)
-        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
-        # evaluation
-        if not args.no_test and batch_acc and not args.use_reader_op:
-            if args.use_inference_transpiler:
-                t = fluid.InferenceTranspiler()
-                t.transpile(infer_prog, place)
-
-            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
-                                 batch_acc)
-            print(", Test Accuracy: %f" % pass_test_acc)
-        print("\n")
-        # TODO(wuyi): add warmup passes to get better perf data.
-        exit(0)
+    return [e.eval() for e in acc_evaluators]


-# TODO(wuyi): replace train, train_parallel, test functions with new trainer
-# API once it is ready.
-def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
-                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
-                   num_trainers, trainer_id):
+# NOTE: only need to benchmark using parallelexe
+def train_parallel(train_args, test_args, args, train_prog, test_prog,
+                   startup_prog, nccl_id_var, num_trainers, trainer_id):
+    over_all_start = time.time()
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    feeder = None
    if not args.use_reader_op:
        feed_var_list = [
            var for var in train_prog.global_block().vars.itervalues()
            if var.is_data
        ]
        feeder = fluid.DataFeeder(feed_var_list, place)
-
    # generate fake:
    if args.use_fake_data:
        for var in feed_var_list:
@@ -230,63 +168,119 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
    startup_exe = fluid.Executor(place)
    startup_exe.run(startup_prog)
    strategy = fluid.ExecutionStrategy()
-    strategy.num_threads = 1
+    strategy.num_threads = args.cpus
    strategy.allow_op_delay = False
+    build_strategy = fluid.BuildStrategy()
+    if args.reduce_strategy == "reduce":
+        build_strategy.reduce_strategy = fluid.BuildStrategy(
+        ).ReduceStrategy.Reduce
+    else:
+        build_strategy.reduce_strategy = fluid.BuildStrategy(
+        ).ReduceStrategy.AllReduce
+    build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
+
+    avg_loss = train_args[0]
+
+    if args.update_method == "pserver":
+        # parameter server mode distributed training, merge
+        # gradients on local server, do not initialize
+        # ParallelExecutor with multi server all-reduce mode.
+        num_trainers = 1
+        trainer_id = 0
+
    exe = fluid.ParallelExecutor(
        True,
        avg_loss.name,
+        main_program=train_prog,
        exec_strategy=strategy,
+        build_strategy=build_strategy,
        num_trainers=num_trainers,
        trainer_id=trainer_id)

+    if not args.no_test:
+        if args.update_method == "pserver":
+            test_scope = None
+        else:
+            # NOTE: use an empty scope to avoid test exe using NCCLID
+            test_scope = fluid.Scope()
+        test_exe = fluid.ParallelExecutor(
+            True, main_program=test_prog, share_vars_from=exe)
+
    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        start_time = time.time()
        if not args.use_reader_op:
-            reader_generator = train_reader()
+            reader_generator = train_args[3]()  #train_reader
        batch_id = 0
        data = None
+        if args.use_reader_op:
+            train_args[4].start()
        while True:
            if not args.use_reader_op:
                data = next(reader_generator, None)
                if data == None:
                    break
+            if args.profile and batch_id == 5:
+                profiler.start_profiler("All")
+                profiler.reset_profiler()
+            elif args.profile and batch_id == 10:
+                print("profiling total time: ", time.time() - start_time)
+                profiler.stop_profiler("total", "/tmp/profile_%d_pass%d" %
+                                       (trainer_id, pass_id))
            if iters == args.iterations:
                reader_generator.close()
                break
-            if args.profile and pass_id == 0 and batch_id == 5:
-                profiler.start_profiler("All")
-            elif args.profile and pass_id == 0 and batch_id == 10:
-                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)

            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
+            fetch_list = [avg_loss.name]
+            acc_name_list = [v.name for v in train_args[2]]
+            fetch_list.extend(acc_name_list)
+
            if args.use_fake_data or args.use_reader_op:
                try:
-                    loss, = exe.run([avg_loss.name])
+                    fetch_ret = exe.run(fetch_list)
+                except fluid.core.EOFException as eof:
+                    break
                except fluid.core.EnforceNotMet as ex:
+                    traceback.print_exc()
                    break
            else:
-                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+                fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
            if args.use_reader_op:
                num_samples += args.batch_size * args.gpus
            else:
                num_samples += len(data)
+
            iters += 1
            if batch_id % 1 == 0:
-                print("Pass %d, batch %d, loss %s" %
-                      (pass_id, batch_id, np.array(loss)))
+                fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
+                print("Pass %d, batch %d, loss %s, accucacys: %s" %
+                      (pass_id, batch_id, fetched_data[0], fetched_data[1:]))
            batch_id += 1

        print_train_time(start_time, time.time(), num_samples)
-        if not args.no_test and batch_acc and not args.use_reader_op:
-            # we have not implement record io for test
-            # skip test when use args.use_reader_op
-            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
-                            batch_acc)
-            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
+        if args.use_reader_op:
+            train_args[4].reset()  # reset reader handle
+        else:
+            del reader_generator
+
+        if not args.no_test and test_args[2]:
+            test_feeder = None
+            if not args.use_reader_op:
+                test_feed_var_list = [
+                    var for var in test_prog.global_block().vars.itervalues()
+                    if var.is_data
+                ]
+                test_feeder = fluid.DataFeeder(test_feed_var_list, place)
+            test_ret = test_parallel(test_exe, test_args, args, test_prog,
+                                     test_feeder)
+            print("Pass: %d, Test Accuracy: %s\n" %
+                  (pass_id, [np.mean(np.array(v)) for v in test_ret]))
+
+    print("total train time: ", time.time() - over_all_start)


 def print_arguments(args):
@@ -328,44 +322,46 @@ def main():
    if args.use_cprof:
        pr = cProfile.Profile()
        pr.enable()
+
    model_def = __import__("models.%s" % args.model, fromlist=["models"])
-    train_args = list(model_def.get_model(args))
-    train_args.append(args)
-    # Run optimizer.minimize(avg_loss)
-    train_args[2].minimize(train_args[0])
-    if args.memory_optimize:
-        fluid.memory_optimize(fluid.default_main_program())
+
+    train_prog = fluid.Program()
+    test_prog = fluid.Program()
+    startup_prog = fluid.Program()
+
+    train_args = list(model_def.get_model(args, True, train_prog, startup_prog))
+    test_args = list(model_def.get_model(args, False, test_prog, startup_prog))
+
+    all_args = [train_args, test_args, args]

    if args.update_method == "pserver":
-        train_prog, startup_prog = dist_transpile(trainer_id, args)
+        train_prog, startup_prog = dist_transpile(trainer_id, args, train_prog,
+                                                  startup_prog)
        if not train_prog:
            raise Exception(
                "Must configure correct environments to run dist train.")
-        train_args.extend([train_prog, startup_prog])
+        all_args.extend([train_prog, test_prog, startup_prog])
        if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
-            train_args.extend([nccl_id_var, num_trainers, trainer_id])
-            train_parallel(*train_args)
-        train(*train_args)
+            all_args.extend([nccl_id_var, num_trainers, trainer_id])
+            train_parallel(*all_args)
+        elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
+            # start pserver with Executor
+            server_exe = fluid.Executor(fluid.CPUPlace())
+            server_exe.run(startup_prog)
+            server_exe.run(train_prog)
        exit(0)

    # for other update methods, use default programs
-    train_args.append(fluid.default_main_program())
-    train_args.append(fluid.default_startup_program())
+    all_args.extend([train_prog, test_prog, startup_prog])

    if args.update_method == "nccl2":
-        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
-    if args.gpus == 1:
-        # NOTE: parallel executor use profiler interanlly
-        if args.use_nvprof and args.device == 'GPU':
-            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
-                train(*train_args)
-        else:
-            train(*train_args)
-    else:
-        if args.device == "CPU":
-            raise Exception("Only support GPU perf with parallel exe")
-        train_args.extend([nccl_id_var, num_trainers, trainer_id])
-        train_parallel(*train_args)
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(
+            trainer_id, startup_prog)
+
+    if args.device == "CPU":
+        raise Exception("Only support GPU perf with parallel exe")
+    all_args.extend([nccl_id_var, num_trainers, trainer_id])
+    train_parallel(*all_args)


 if __name__ == "__main__":

--- a/benchmark/fluid/imagenet_reader.py
+++ b/benchmark/fluid/imagenet_reader.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import math
+import random
+import functools
+import numpy as np
+from threading import Thread
+import subprocess
+import time
+
+from Queue import Queue
+import paddle
+from PIL import Image, ImageEnhance
+
+random.seed(0)
+
+DATA_DIM = 224
+
+THREAD = int(os.getenv("PREPROCESS_THREADS", "10"))
+BUF_SIZE = 5120
+
+DATA_DIR = '/mnt/ImageNet'
+TRAIN_LIST = '/mnt/ImageNet/train.txt'
+TEST_LIST = '/mnt/ImageNet/val.txt'
+
+img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
+img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
+
+
+def resize_short(img, target_size):
+    percent = float(target_size) / min(img.size[0], img.size[1])
+    resized_width = int(round(img.size[0] * percent))
+    resized_height = int(round(img.size[1] * percent))
+    img = img.resize((resized_width, resized_height), Image.LANCZOS)
+    return img
+
+
+def crop_image(img, target_size, center):
+    width, height = img.size
+    size = target_size
+    if center == True:
+        w_start = (width - size) / 2
+        h_start = (height - size) / 2
+    else:
+        w_start = random.randint(0, width - size)
+        h_start = random.randint(0, height - size)
+    w_end = w_start + size
+    h_end = h_start + size
+    img = img.crop((w_start, h_start, w_end, h_end))
+    return img
+
+
+def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
+    aspect_ratio = math.sqrt(random.uniform(*ratio))
+    w = 1. * aspect_ratio
+    h = 1. / aspect_ratio
+
+    bound = min((float(img.size[0]) / img.size[1]) / (w**2),
+                (float(img.size[1]) / img.size[0]) / (h**2))
+    scale_max = min(scale[1], bound)
+    scale_min = min(scale[0], bound)
+
+    target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
+                                                             scale_max)
+    target_size = math.sqrt(target_area)
+    w = int(target_size * w)
+    h = int(target_size * h)
+
+    i = random.randint(0, img.size[0] - w)
+    j = random.randint(0, img.size[1] - h)
+
+    img = img.crop((i, j, i + w, j + h))
+    img = img.resize((size, size), Image.LANCZOS)
+    return img
+
+
+def rotate_image(img):
+    angle = random.randint(-10, 10)
+    img = img.rotate(angle)
+    return img
+
+
+def distort_color(img):
+    def random_brightness(img, lower=0.5, upper=1.5):
+        e = random.uniform(lower, upper)
+        return ImageEnhance.Brightness(img).enhance(e)
+
+    def random_contrast(img, lower=0.5, upper=1.5):
+        e = random.uniform(lower, upper)
+        return ImageEnhance.Contrast(img).enhance(e)
+
+    def random_color(img, lower=0.5, upper=1.5):
+        e = random.uniform(lower, upper)
+        return ImageEnhance.Color(img).enhance(e)
+
+    ops = [random_brightness, random_contrast, random_color]
+    random.shuffle(ops)
+
+    img = ops[0](img)
+    img = ops[1](img)
+    img = ops[2](img)
+
+    return img
+
+
+def process_image(sample, mode, color_jitter, rotate):
+    img_path = sample[0]
+
+    img = Image.open(img_path)
+    if mode == 'train':
+        if rotate: img = rotate_image(img)
+        img = random_crop(img, DATA_DIM)
+    else:
+        img = resize_short(img, target_size=256)
+        img = crop_image(img, target_size=DATA_DIM, center=True)
+    if mode == 'train':
+        if color_jitter:
+            img = distort_color(img)
+        if random.randint(0, 1) == 1:
+            img = img.transpose(Image.FLIP_LEFT_RIGHT)
+
+    if img.mode != 'RGB':
+        img = img.convert('RGB')
+
+    img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
+    img -= img_mean
+    img /= img_std
+
+    if mode == 'train' or mode == 'val':
+        return img, sample[1]
+    elif mode == 'test':
+        return [img]
+
+
+class XmapEndSignal():
+    pass
+
+
+def xmap_readers(mapper,
+                 reader,
+                 process_num,
+                 buffer_size,
+                 order=False,
+                 print_queue_state=True):
+    end = XmapEndSignal()
+
+    # define a worker to read samples from reader to in_queue
+    def read_worker(reader, in_queue):
+        for i in reader():
+            in_queue.put(i)
+        in_queue.put(end)
+
+    # define a worker to read samples from reader to in_queue with order flag
+    def order_read_worker(reader, in_queue, file_queue):
+        in_order = 0
+        for i in reader():
+            in_queue.put((in_order, i))
+            in_order += 1
+        in_queue.put(end)
+
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue
+    def handle_worker(in_queue, out_queue, mapper):
+        sample = in_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            r = mapper(sample)
+            out_queue.put(r)
+            sample = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue by order
+    def order_handle_worker(in_queue, out_queue, mapper, out_order):
+        ins = in_queue.get()
+        while not isinstance(ins, XmapEndSignal):
+            order, sample = ins
+            r = mapper(sample)
+            while order != out_order[0]:
+                pass
+            out_queue.put(r)
+            out_order[0] += 1
+            ins = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    def xreader():
+        file_queue = Queue()
+        in_queue = Queue(buffer_size)
+        out_queue = Queue(buffer_size)
+        out_order = [0]
+        # start a read worker in a thread
+        target = order_read_worker if order else read_worker
+        t = Thread(target=target, args=(reader, in_queue))
+        t.daemon = True
+        t.start()
+        # start several handle_workers
+        target = order_handle_worker if order else handle_worker
+        args = (in_queue, out_queue, mapper, out_order) if order else (
+            in_queue, out_queue, mapper)
+        workers = []
+        for i in xrange(process_num):
+            worker = Thread(target=target, args=args)
+            worker.daemon = True
+            workers.append(worker)
+        for w in workers:
+            w.start()
+
+        sample = out_queue.get()
+        start_t = time.time()
+        while not isinstance(sample, XmapEndSignal):
+            yield sample
+            sample = out_queue.get()
+            if time.time() - start_t > 3:
+                if print_queue_state:
+                    print("queue sizes: ", in_queue.qsize(), out_queue.qsize())
+                start_t = time.time()
+        finish = 1
+        while finish < process_num:
+            sample = out_queue.get()
+            if isinstance(sample, XmapEndSignal):
+                finish += 1
+            else:
+                yield sample
+
+    return xreader
+
+
+def _reader_creator(file_list,
+                    mode,
+                    shuffle=False,
+                    color_jitter=False,
+                    rotate=False,
+                    xmap=True):
+    def reader():
+        with open(file_list) as flist:
+            full_lines = [line.strip() for line in flist]
+            if shuffle:
+                random.shuffle(full_lines)
+            if mode == 'train':
+                trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+                trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+                per_node_lines = len(full_lines) / trainer_count
+                lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1)
+                                   * per_node_lines]
+                print(
+                    "read images from %d, length: %d, lines length: %d, total: %d"
+                    % (trainer_id * per_node_lines, per_node_lines, len(lines),
+                       len(full_lines)))
+            else:
+                lines = full_lines
+
+            for line in lines:
+                if mode == 'train':
+                    img_path, label = line.split()
+                    img_path = img_path.replace("JPEG", "jpeg")
+                    img_path = os.path.join(DATA_DIR, "train", img_path)
+                    yield (img_path, int(label))
+                elif mode == 'val':
+                    img_path, label = line.split()
+                    img_path = img_path.replace("JPEG", "jpeg")
+                    img_path = os.path.join(DATA_DIR, "val", img_path)
+                    yield (img_path, int(label))
+                elif mode == 'test':
+                    img_path = os.path.join(DATA_DIR, line)
+                    yield [img_path]
+
+    mapper = functools.partial(
+        process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
+
+    return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
+
+
+def load_raw_image_uint8(sample):
+    img_arr = np.array(Image.open(sample[0])).astype('int64')
+    return img_arr, int(sample[1])
+
+
+def train_raw(file_list=TRAIN_LIST, shuffle=True):
+    def reader():
+        with open(file_list) as flist:
+            full_lines = [line.strip() for line in flist]
+            if shuffle:
+                random.shuffle(full_lines)
+
+            trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+            trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+            per_node_lines = len(full_lines) / trainer_count
+            lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) *
+                               per_node_lines]
+            print("read images from %d, length: %d, lines length: %d, total: %d"
+                  % (trainer_id * per_node_lines, per_node_lines, len(lines),
+                     len(full_lines)))
+
+            for line in lines:
+                img_path, label = line.split()
+                img_path = img_path.replace("JPEG", "jpeg")
+                img_path = os.path.join(DATA_DIR, "train", img_path)
+                yield (img_path, int(label))
+
+    return paddle.reader.xmap_readers(load_raw_image_uint8, reader, THREAD,
+                                      BUF_SIZE)
+
+
+def train(file_list=TRAIN_LIST, xmap=True):
+    return _reader_creator(
+        file_list,
+        'train',
+        shuffle=True,
+        color_jitter=False,
+        rotate=False,
+        xmap=xmap)
+
+
+def val(file_list=TEST_LIST, xmap=True):
+    return _reader_creator(file_list, 'val', shuffle=False, xmap=xmap)
+
+
+def test(file_list=TEST_LIST):
+    return _reader_creator(file_list, 'test', shuffle=False)
+
+
+if __name__ == "__main__":
+    c = 0
+    start_t = time.time()
+    for d in train()():
+        c += 1
+        if c >= 10000:
+            break
+    spent = time.time() - start_t
+    print("read 10000 speed: ", 10000 / spent, spent)
--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
@@ -163,6 +163,19 @@ def gen_job():
        volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
        volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})

+    # add ceph volumes
+    volumes.append({
+        "name": "ceph-data",
+        "cephfs": {
+            "monitors": ["192.168.16.23:6789"],
+            "secretRef": {
+                "name": "ceph-secret"
+            },
+            "user": "admin",
+        }
+    })
+    volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
+
    tn["spec"]["template"]["spec"]["volumes"] = volumes
    tn_container["volumeMounts"] = volumeMounts


--- a/benchmark/fluid/models/__init__.py
+++ b/benchmark/fluid/models/__init__.py
@@ -13,5 +13,6 @@
 # limitations under the License.

 __all__ = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
+    "resnet_with_preprocess"
 ]
--- a/benchmark/fluid/models/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """seq2seq model for fluid."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -181,7 +182,7 @@ def lodtensor_to_ndarray(lod_tensor):
    return ndarray


-def get_model(args):
+def get_model(args, is_train, main_prog, startup_prog):
    if args.use_reader_op:
        raise Exception("machine_translation do not support reader op for now.")
    embedding_dim = 512
@@ -190,30 +191,27 @@ def get_model(args):
    dict_size = 30000
    beam_size = 3
    max_length = 250
-    avg_cost, feeding_list = seq_to_seq_net(
-        embedding_dim,
-        encoder_size,
-        decoder_size,
-        dict_size,
-        dict_size,
-        False,
-        beam_size=beam_size,
-        max_length=max_length)
-
-    # clone from default main program
-    inference_program = fluid.default_main_program().clone()
-
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-
-    train_batch_generator = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=args.batch_size * args.gpus)

-    test_batch_generator = paddle.batch(
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            avg_cost, feeding_list = seq_to_seq_net(
+                embedding_dim,
+                encoder_size,
+                decoder_size,
+                dict_size,
+                dict_size,
+                False,
+                beam_size=beam_size,
+                max_length=max_length)
+    if is_train:
+        optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+        optimizer.minimize(avg_cost)
+
+    batch_generator = paddle.batch(
        paddle.reader.shuffle(
-            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
-        batch_size=args.batch_size)
+            paddle.dataset.wmt14.train(dict_size)
+            if is_train else paddle.dataset.wmt14.test(dict_size),
+            buf_size=1000),
+        batch_size=args.batch_size * args.gpus)

-    return avg_cost, inference_program, optimizer, train_batch_generator, \
-           test_batch_generator, None
+    return avg_cost, optimizer, [], batch_generator, None
--- a/benchmark/fluid/models/mnist.py
+++ b/benchmark/fluid/models/mnist.py
@@ -65,61 +65,53 @@ def cnn_model(data):
    return predict


-def get_model(args):
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1, 1, 28, 28], (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        images, label = fluid.layers.read_file(data_file)
-    else:
-        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = cnn_model(pd.read_input(images))
-            label = pd.read_input(label)
+def get_model(args, is_train, main_prog, startup_prog):
+    # NOTE: mnist is small, we don't implement data sharding yet.
+    opt = None
+    data_file_handle = None
+    with fluid.program_guard(main_prog, startup_prog):
+        if args.use_reader_op:
+            filelist = [
+                os.path.join(args.data_path, f)
+                for f in os.listdir(args.data_path)
+            ]
+            data_file_handle = fluid.layers.open_files(
+                filenames=filelist,
+                shapes=[[-1, 1, 28, 28], (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"],
+                thread_num=1,
+                pass_num=1)
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(
+                    data_file_handle, batch_size=args.batch_size))
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                input, label = fluid.layers.read_file(data_file)
+            else:
+                images = fluid.layers.data(
+                    name='pixel', shape=[1, 28, 28], dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            predict = cnn_model(images)
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(x=cost)
+            # Evaluator
            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)
-
-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
-    else:
-        # Train program
-        predict = cnn_model(images)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        # Evaluator
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-
-    # Optimization
-    opt = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.001, beta1=0.9, beta2=0.999)
+            # Optimization
+            if is_train:
+                opt = fluid.optimizer.AdamOptimizer(
+                    learning_rate=0.001, beta1=0.9, beta2=0.999)
+                opt.minimize(avg_cost)
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)

    # Reader
-    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=args.batch_size)
-    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
+    if is_train:
+        reader = paddle.dataset.mnist.train()
+    else:
+        reader = paddle.dataset.mnist.test()
+    batched_reader = paddle.batch(
+        reader, batch_size=args.batch_size * args.gpus)
+    return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -20,6 +20,7 @@ import functools
 import numpy as np
 import time
 import os
+import math

 import cProfile, pstats, StringIO

@@ -27,182 +28,210 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
-from recordio_converter import imagenet_train, imagenet_test
-
-
-def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
-    conv1 = fluid.layers.conv2d(
-        input=input,
-        filter_size=filter_size,
-        num_filters=ch_out,
-        stride=stride,
-        padding=padding,
-        act=None,
-        bias_attr=False)
-    return fluid.layers.batch_norm(input=conv1, act=act)
-
-
-def shortcut(input, ch_out, stride):
-    ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
-    if ch_in != ch_out:
-        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
-    else:
-        return input
-
-
-def basicblock(input, ch_out, stride):
-    short = shortcut(input, ch_out, stride)
-    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
-    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
-
-
-def bottleneck(input, ch_out, stride):
-    short = shortcut(input, ch_out * 4, stride)
-    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
-    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
-    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+from imagenet_reader import train, val
+
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "batch_size": 256,
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+
+
+class ResNet():
+    def __init__(self, layers=50, is_train=True):
+        self.params = train_parameters
+        self.layers = layers
+        self.is_train = is_train
+
+    def net(self, input, class_dim=1000):
+        layers = self.layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+
+        if layers == 50:
+            depth = [3, 4, 6, 3]
+        elif layers == 101:
+            depth = [3, 4, 23, 3]
+        elif layers == 152:
+            depth = [3, 8, 36, 3]
+        num_filters = [64, 128, 256, 512]
+
+        conv = self.conv_bn_layer(
+            input=input, num_filters=64, filter_size=7, stride=2, act='relu')
+        conv = fluid.layers.pool2d(
+            input=conv,
+            pool_size=3,
+            pool_stride=2,
+            pool_padding=1,
+            pool_type='max')
+
+        for block in range(len(depth)):
+            for i in range(depth[block]):
+                conv = self.bottleneck_block(
+                    input=conv,
+                    num_filters=num_filters[block],
+                    stride=2 if i == 0 and block != 0 else 1)
+
+        pool = fluid.layers.pool2d(
+            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+        out = fluid.layers.fc(input=pool,
+                              size=class_dim,
+                              act='softmax',
+                              param_attr=fluid.param_attr.ParamAttr(
+                                  initializer=fluid.initializer.Uniform(-stdv,
+                                                                        stdv)))
+        return out
+
+    def conv_bn_layer(self,
+                      input,
+                      num_filters,
+                      filter_size,
+                      stride=1,
+                      groups=1,
+                      act=None):
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(
+            input=conv, act=act, is_test=not self.is_train)
+
+    def shortcut(self, input, ch_out, stride):
+        ch_in = input.shape[1]
+        if ch_in != ch_out or stride != 1:
+            return self.conv_bn_layer(input, ch_out, 1, stride)
+        else:
+            return input

+    def bottleneck_block(self, input, num_filters, stride):
+        conv0 = self.conv_bn_layer(
+            input=input, num_filters=num_filters, filter_size=1, act='relu')
+        conv1 = self.conv_bn_layer(
+            input=conv0,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            act='relu')
+        conv2 = self.conv_bn_layer(
+            input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)

-def layer_warp(block_func, input, ch_out, count, stride):
-    res_out = block_func(input, ch_out, stride)
-    for i in range(1, count):
-        res_out = block_func(res_out, ch_out, 1)
-    return res_out
+        short = self.shortcut(input, num_filters * 4, stride)

+        return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')

-def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):

-    cfg = {
-        18: ([2, 2, 2, 1], basicblock),
-        34: ([3, 4, 6, 3], basicblock),
-        50: ([3, 4, 6, 3], bottleneck),
-        101: ([3, 4, 23, 3], bottleneck),
-        152: ([3, 8, 36, 3], bottleneck)
-    }
-    stages, block_func = cfg[depth]
-    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
-    pool1 = fluid.layers.pool2d(
-        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
-    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
-    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
-    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
-    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
-    pool2 = fluid.layers.pool2d(
-        input=res4,
-        pool_size=7,
-        pool_type='avg',
-        pool_stride=1,
-        global_pooling=True)
-    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
-    return out
-
-
-def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
-    assert (depth - 2) % 6 == 0
-
-    n = (depth - 2) // 6
-
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
-    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
-    return out
-
-
-def get_model(args):
-    model = resnet_cifar10
-    if args.data_set == "cifar10":
-        class_dim = 10
-        if args.data_format == 'NCHW':
-            dshape = [3, 32, 32]
-        else:
-            dshape = [32, 32, 3]
-        model = resnet_cifar10
-        train_reader = paddle.dataset.cifar.train10()
-        test_reader = paddle.dataset.cifar.test10()
-    elif args.data_set == "flowers":
+def _model_reader_dshape_classdim(args, is_train):
+    model = None
+    reader = None
+    if args.data_set == "flowers":
        class_dim = 102
        if args.data_format == 'NCHW':
            dshape = [3, 224, 224]
        else:
            dshape = [224, 224, 3]
-        model = resnet_imagenet
-        train_reader = paddle.dataset.flowers.train()
-        test_reader = paddle.dataset.flowers.test()
+        if is_train:
+            reader = paddle.dataset.flowers.train()
+        else:
+            reader = paddle.dataset.flowers.test()
    elif args.data_set == "imagenet":
        class_dim = 1000
        if args.data_format == 'NCHW':
            dshape = [3, 224, 224]
        else:
            dshape = [224, 224, 3]
-        model = resnet_imagenet
        if not args.data_path:
            raise Exception(
                "Must specify --data_path when training with imagenet")
-        train_reader = imagenet_train(args.data_path)
-        test_reader = imagenet_test(args.data_path)
-
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1] + dshape, (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        input, label = fluid.layers.read_file(data_file)
-    else:
-        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = model(pd.read_input(input), class_dim)
-            label = pd.read_input(label)
+        if not args.use_reader_op:
+            if is_train:
+                reader = train()
+            else:
+                reader = val()
+        else:
+            if is_train:
+                reader = train(xmap=False)
+            else:
+                reader = val(xmap=False)
+    return reader, dshape, class_dim
+
+
+def get_model(args, is_train, main_prog, startup_prog):
+    reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train)
+
+    pyreader = None
+    trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=args.batch_size * args.gpus,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('float32', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            model = ResNet(is_train=is_train)
+            predict = model.net(input, class_dim=class_dim)
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(x=cost)
-            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)

-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
+            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
+            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
+
+            # configure optimize
+            optimizer = None
+            if is_train:
+                total_images = 1281167 / trainer_count
+
+                step = int(total_images / (args.batch_size * args.gpus) + 1)
+                epochs = [30, 60, 90]
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = []
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=fluid.layers.piecewise_decay(
+                        boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+                optimizer.minimize(avg_cost)
+
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader if args.no_random else paddle.reader.shuffle(
+                reader, buf_size=5120),
+            batch_size=args.batch_size * args.gpus,
+            drop_last=True)
    else:
-        predict = model(input, class_dim)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc])
-
-    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-
-    batched_train_reader = paddle.batch(
-        train_reader if args.no_random else paddle.reader.shuffle(
-            train_reader, buf_size=5120),
-        batch_size=args.batch_size * args.gpus,
-        drop_last=True)
-    batched_test_reader = paddle.batch(
-        test_reader, batch_size=args.batch_size, drop_last=True)
-
-    return avg_cost, inference_program, optimizer, batched_train_reader,\
-                   batched_test_reader, batch_acc
+        batched_reader = None
+        pyreader.decorate_paddle_reader(
+            paddle.batch(
+                reader if args.no_random else paddle.reader.shuffle(
+                    reader, buf_size=5120),
+                batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [batch_acc1,
+                                 batch_acc5], batched_reader, pyreader
--- a/benchmark/fluid/models/resnet_with_preprocess.py
+++ b/benchmark/fluid/models/resnet_with_preprocess.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import numpy as np
+import time
+import os
+
+import cProfile, pstats, StringIO
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+# from recordio_converter import imagenet_train, imagenet_test
+from imagenet_reader import train_raw, val
+
+
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  act='relu',
+                  is_train=True):
+    conv1 = fluid.layers.conv2d(
+        input=input,
+        filter_size=filter_size,
+        num_filters=ch_out,
+        stride=stride,
+        padding=padding,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)
+
+
+def shortcut(input, ch_out, stride, is_train=True):
+    ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
+    if ch_in != ch_out:
+        return conv_bn_layer(
+            input, ch_out, 1, stride, 0, None, is_train=is_train)
+    else:
+        return input
+
+
+def basicblock(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
+    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def bottleneck(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out * 4, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
+    conv3 = conv_bn_layer(
+        conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
+    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):
+    res_out = block_func(input, ch_out, stride)
+    for i in range(1, count):
+        res_out = block_func(res_out, ch_out, 1)
+    return res_out
+
+
+def resnet_imagenet(input,
+                    class_dim,
+                    depth=50,
+                    data_format='NCHW',
+                    is_train=True):
+
+    cfg = {
+        18: ([2, 2, 2, 1], basicblock),
+        34: ([3, 4, 6, 3], basicblock),
+        50: ([3, 4, 6, 3], bottleneck),
+        101: ([3, 4, 23, 3], bottleneck),
+        152: ([3, 8, 36, 3], bottleneck)
+    }
+    stages, block_func = cfg[depth]
+    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+    pool1 = fluid.layers.pool2d(
+        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+    pool2 = fluid.layers.pool2d(
+        input=res4,
+        pool_size=7,
+        pool_type='avg',
+        pool_stride=1,
+        global_pooling=True)
+    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
+    return out
+
+
+def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
+    assert (depth - 2) % 6 == 0
+
+    n = (depth - 2) // 6
+
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
+    return out
+
+
+def _model_reader_dshape_classdim(args, is_train):
+    model = resnet_cifar10
+    reader = None
+    if args.data_set == "cifar10":
+        class_dim = 10
+        if args.data_format == 'NCHW':
+            dshape = [3, 32, 32]
+        else:
+            dshape = [32, 32, 3]
+        model = resnet_cifar10
+        if is_train:
+            reader = paddle.dataset.cifar.train10()
+        else:
+            reader = paddle.dataset.cifar.test10()
+    elif args.data_set == "flowers":
+        class_dim = 102
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+        model = resnet_imagenet
+        if is_train:
+            reader = paddle.dataset.flowers.train()
+        else:
+            reader = paddle.dataset.flowers.test()
+    elif args.data_set == "imagenet":
+        class_dim = 1000
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+        model = resnet_imagenet
+        if not args.data_path:
+            raise Exception(
+                "Must specify --data_path when training with imagenet")
+        if not args.use_reader_op:
+            if is_train:
+                reader = train_raw()
+            else:
+                reader = val()
+        else:
+            if is_train:
+                reader = train_raw()
+            else:
+                reader = val(xmap=False)
+    return model, reader, dshape, class_dim
+
+
+def get_model(args, is_train, main_prog, startup_prog):
+    model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
+                                                                     is_train)
+
+    pyreader = None
+    trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=args.batch_size * args.gpus,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('uint8', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='uint8')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            # add imagenet preprocessors
+            random_crop = fluid.layers.random_crop(input, dshape)
+            casted = fluid.layers.cast(random_crop, 'float32')
+            # input is HWC
+            trans = fluid.layers.transpose(casted, [0, 3, 1, 2]) / 255.0
+            img_mean = fluid.layers.tensor.assign(
+                np.array([0.485, 0.456, 0.406]).astype('float32').reshape((3, 1,
+                                                                           1)))
+            img_std = fluid.layers.tensor.assign(
+                np.array([0.229, 0.224, 0.225]).astype('float32').reshape((3, 1,
+                                                                           1)))
+            h1 = fluid.layers.elementwise_sub(trans, img_mean, axis=1)
+            h2 = fluid.layers.elementwise_div(h1, img_std, axis=1)
+
+            # pre_out = (trans - img_mean) / img_std
+
+            predict = model(h2, class_dim, is_train=is_train)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+
+            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
+            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
+
+            # configure optimize
+            optimizer = None
+            if is_train:
+                total_images = 1281167 / trainer_count
+
+                step = int(total_images / args.batch_size + 1)
+                epochs = [30, 60, 80, 90]
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = []
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=base_lr,
+                    #learning_rate=fluid.layers.piecewise_decay(
+                    #    boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+                optimizer.minimize(avg_cost)
+
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader if args.no_random else paddle.reader.shuffle(
+                reader, buf_size=5120),
+            batch_size=args.batch_size * args.gpus,
+            drop_last=True)
+    else:
+        batched_reader = None
+        pyreader.decorate_paddle_reader(
+            paddle.batch(
+                # reader if args.no_random else paddle.reader.shuffle(
+                #     reader, buf_size=5120),
+                reader,
+                batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [batch_acc1,
+                                 batch_acc5], batched_reader, pyreader
--- a/benchmark/fluid/models/se_resnext.py
+++ b/benchmark/fluid/models/se_resnext.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import math
+import os
+from imagenet_reader import train, val
+
+__all__ = [
+    "SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d",
+    "SE_ResNeXt152_32x4d", "get_model"
+]
+
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "batch_size": 256,
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+
+
+class SE_ResNeXt():
+    def __init__(self, layers=50, is_train=True):
+        self.params = train_parameters
+        self.layers = layers
+        self.is_train = is_train
+
+    def net(self, input, class_dim=1000):
+        layers = self.layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+        if layers == 50:
+            cardinality = 32
+            reduction_ratio = 16
+            depth = [3, 4, 6, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+        elif layers == 101:
+            cardinality = 32
+            reduction_ratio = 16
+            depth = [3, 4, 23, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+        elif layers == 152:
+            cardinality = 64
+            reduction_ratio = 16
+            depth = [3, 8, 36, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=3,
+                stride=2,
+                act='relu')
+            conv = self.conv_bn_layer(
+                input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
+            conv = self.conv_bn_layer(
+                input=conv,
+                num_filters=128,
+                filter_size=3,
+                stride=1,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
+                pool_type='max')
+
+        for block in range(len(depth)):
+            for i in range(depth[block]):
+                conv = self.bottleneck_block(
+                    input=conv,
+                    num_filters=num_filters[block],
+                    stride=2 if i == 0 and block != 0 else 1,
+                    cardinality=cardinality,
+                    reduction_ratio=reduction_ratio)
+
+        pool = fluid.layers.pool2d(
+            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+        drop = fluid.layers.dropout(x=pool, dropout_prob=0.5)
+        stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
+        out = fluid.layers.fc(input=drop,
+                              size=class_dim,
+                              act='softmax',
+                              param_attr=fluid.param_attr.ParamAttr(
+                                  initializer=fluid.initializer.Uniform(-stdv,
+                                                                        stdv)))
+        return out
+
+    def shortcut(self, input, ch_out, stride):
+        ch_in = input.shape[1]
+        if ch_in != ch_out or stride != 1:
+            filter_size = 1
+            return self.conv_bn_layer(input, ch_out, filter_size, stride)
+        else:
+            return input
+
+    def bottleneck_block(self, input, num_filters, stride, cardinality,
+                         reduction_ratio):
+        conv0 = self.conv_bn_layer(
+            input=input, num_filters=num_filters, filter_size=1, act='relu')
+        conv1 = self.conv_bn_layer(
+            input=conv0,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            groups=cardinality,
+            act='relu')
+        conv2 = self.conv_bn_layer(
+            input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+        scale = self.squeeze_excitation(
+            input=conv2,
+            num_channels=num_filters * 2,
+            reduction_ratio=reduction_ratio)
+
+        short = self.shortcut(input, num_filters * 2, stride)
+
+        return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+    def conv_bn_layer(self,
+                      input,
+                      num_filters,
+                      filter_size,
+                      stride=1,
+                      groups=1,
+                      act=None):
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) / 2,
+            groups=groups,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(
+            input=conv, act=act, is_test=not self.is_train)
+
+    def squeeze_excitation(self, input, num_channels, reduction_ratio):
+        pool = fluid.layers.pool2d(
+            input=input, pool_size=0, pool_type='avg', global_pooling=True)
+        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+        squeeze = fluid.layers.fc(input=pool,
+                                  size=num_channels / reduction_ratio,
+                                  act='relu',
+                                  param_attr=fluid.param_attr.ParamAttr(
+                                      initializer=fluid.initializer.Uniform(
+                                          -stdv, stdv)))
+        stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
+        excitation = fluid.layers.fc(input=squeeze,
+                                     size=num_channels,
+                                     act='sigmoid',
+                                     param_attr=fluid.param_attr.ParamAttr(
+                                         initializer=fluid.initializer.Uniform(
+                                             -stdv, stdv)))
+        scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+        return scale
+
+
+def SE_ResNeXt50_32x4d():
+    model = SE_ResNeXt(layers=50)
+    return model
+
+
+def SE_ResNeXt101_32x4d():
+    model = SE_ResNeXt(layers=101)
+    return model
+
+
+def SE_ResNeXt152_32x4d():
+    model = SE_ResNeXt(layers=152)
+    return model
+
+
+def get_model(args, is_train, main_prog, startup_prog):
+    model = SE_ResNeXt(layers=50)
+    batched_reader = None
+    pyreader = None
+    trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+    dshape = train_parameters["input_size"]
+
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=10,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('float32', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            out = model.net(input=input)
+            cost = fluid.layers.cross_entropy(input=out, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+            acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+            optimizer = None
+            if is_train:
+                total_images = 1281167 / trainer_count
+
+                step = int(total_images / args.batch_size + 1)
+                epochs = [40, 80, 100]
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = []
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    # learning_rate=base_lr,
+                    learning_rate=fluid.layers.piecewise_decay(
+                        boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+                optimizer.minimize(avg_cost)
+
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    if is_train:
+        reader = train()
+    else:
+        reader = val()
+
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader, batch_size=args.batch_size * args.gpus, drop_last=True)
+    else:
+        pyreader.decorate_paddle_reader(
+            paddle.batch(
+                reader, batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [acc_top1, acc_top5], batched_reader, pyreader
--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -26,7 +26,6 @@ import numpy
 import paddle
 import paddle.dataset.imdb as imdb
 import paddle.fluid as fluid
-import paddle.batch as batch
 import paddle.fluid.profiler as profiler

 word_dict = imdb.word_dict()
@@ -43,19 +42,7 @@ def crop_sentence(reader, crop_size):
    return __impl__


-def get_model(args):
-    if args.use_reader_op:
-        raise Exception(
-            "stacked_dynamic_lstm do not support reader op for now.")
-    lstm_size = 512
-    emb_dim = 512
-    crop_size = 1500
-
-    data = fluid.layers.data(
-        name="words", shape=[1], lod_level=1, dtype='int64')
-    sentence = fluid.layers.embedding(
-        input=data, size=[len(word_dict), emb_dim])
-
+def lstm_net(sentence, lstm_size):
    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')

    rnn = fluid.layers.DynamicRNN()
@@ -97,31 +84,47 @@ def get_model(args):

    last = fluid.layers.sequence_pool(rnn(), 'last')
    logit = fluid.layers.fc(input=last, size=2, act='softmax')
-    loss = fluid.layers.cross_entropy(
-        input=logit,
-        label=fluid.layers.data(
-            name='label', shape=[1], dtype='int64'))
-    loss = fluid.layers.mean(x=loss)
+    return logit

-    # add acc
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
-                shape=[1], dtype='int64'), total=batch_size_tensor)

-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
-    adam = fluid.optimizer.Adam()
+def get_model(args, is_train, main_prog, startup_prog):
+    if args.use_reader_op:
+        raise Exception(
+            "stacked_dynamic_lstm do not support reader op for now.")
+    lstm_size = 512
+    emb_dim = 512
+    crop_size = 1500

-    train_reader = batch(
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            data = fluid.layers.data(
+                name="words", shape=[1], lod_level=1, dtype='int64')
+            sentence = fluid.layers.embedding(
+                input=data, size=[len(word_dict), emb_dim])
+            logit = lstm_net(sentence, lstm_size)
+            loss = fluid.layers.cross_entropy(
+                input=logit,
+                label=fluid.layers.data(
+                    name='label', shape=[1], dtype='int64'))
+            loss = fluid.layers.mean(x=loss)
+
+            # add acc
+            batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
+                        shape=[1], dtype='int64'), total=batch_size_tensor)
+
+            if is_train:
+                adam = fluid.optimizer.Adam()
+                adam.minimize(loss)
+
+    if is_train:
+        reader = crop_sentence(imdb.train(word_dict), crop_size)
+    else:
+        reader = crop_sentence(imdb.test(word_dict), crop_size)
+
+    batched_reader = paddle.batch(
        paddle.reader.shuffle(
-            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
+            reader, buf_size=25000),
        batch_size=args.batch_size * args.gpus)
-    test_reader = batch(
-        paddle.reader.shuffle(
-            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
-        batch_size=args.batch_size)

-    return loss, inference_program, adam, train_reader, test_reader, batch_acc
+    return loss, adam, [batch_acc], batched_reader, None
--- a/benchmark/fluid/models/vgg.py
+++ b/benchmark/fluid/models/vgg.py
@@ -25,7 +25,7 @@ import functools
 import os


-def vgg16_bn_drop(input):
+def vgg16_bn_drop(input, is_train=True):
    def conv_block(input, num_filter, groups, dropouts):
        return fluid.nets.img_conv_group(
            input=input,
@@ -46,13 +46,13 @@ def vgg16_bn_drop(input):

    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train)
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
    return fc2


-def get_model(args):
+def get_model(args, is_train, main_prog, startup_prog):
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
@@ -65,57 +65,56 @@ def get_model(args):
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]
+    filelist = [
+        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+    ]
+    with fluid.program_guard(main_prog, startup_prog):
+        if args.use_reader_op:
+            data_file_handle = fluid.layers.open_files(
+                filenames=filelist,
+                shapes=[[-1] + data_shape, (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"],
+                thread_num=1,
+                pass_num=1)
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(
+                    data_file_handle, batch_size=args.batch_size))
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                images, label = fluid.layers.read_file(data_file)
+            else:
+                images = fluid.layers.data(
+                    name='data', shape=data_shape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+            # Train program
+            net = vgg16_bn_drop(images, is_train=is_train)
+            predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)

-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1] + data_shape, (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        images, label = fluid.layers.read_file(data_file)
-    else:
-        images = fluid.layers.data(
-            name='data', shape=data_shape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    # Train program
-    net = vgg16_bn_drop(images)
-    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
-    # Optimization
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+            # Evaluator
+            batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            batch_acc = fluid.layers.accuracy(
+                input=predict, label=label, total=batch_size_tensor)
+            # Optimization
+            if is_train:
+                optimizer = fluid.optimizer.Adam(
+                    learning_rate=args.learning_rate)
+                optimizer.minimize(avg_cost)

    # data reader
-    train_reader = paddle.batch(
+    if is_train:
+        reader = paddle.dataset.cifar.train10() \
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train()
+    else:
+        reader = paddle.dataset.cifar.test10() \
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test()
+
+    batched_reader = paddle.batch(
        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
+            reader, buf_size=5120),
        batch_size=args.batch_size * args.gpus)
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10()
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-        batch_size=args.batch_size)

-    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
+    return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -40,7 +40,7 @@ set(OPENBLAS_LIB_SEARCH_PATHS
        /usr/local/opt/openblas/lib)

 find_path(OPENBLAS_INC_DIR NAMES cblas.h
-  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH)
 find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
 find_library(OPENBLAS_LIB NAMES openblas

--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -178,9 +178,11 @@ else(NOT WIN32)
 list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler -fPIC" "-Xcompiler /w")
 endif(NOT WIN32)

-list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+if(WITH_FAST_MATH)
+  # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+  list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
 # in cuda9, suppress cuda warning on eigen 
-list(APPEND CUDA_NVCC_FLAGS "-w")
+
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")


--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -16,16 +16,6 @@ set(ANAKIN_LIBRARY     ${ANAKIN_INSTALL_DIR})
 set(ANAKIN_SHARED_LIB  ${ANAKIN_LIBRARY}/libanakin.so)
 set(ANAKIN_SABER_LIB   ${ANAKIN_LIBRARY}/libanakin_saber_common.so)

-# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
-set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
-set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
-set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
-set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
-execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
-
 include_directories(${ANAKIN_INCLUDE})
 include_directories(${ANAKIN_INCLUDE}/saber/)
 include_directories(${ANAKIN_INCLUDE}/saber/core/)
@@ -48,21 +38,25 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
    -Wno-reorder
    -Wno-error=cpp)

+if(WITH_GPU)
+    set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=YES -DCUDNN_ROOT=${CUDNN_ROOT} -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR})
+else()
+    set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=NO)
+endif()
 ExternalProject_Add(
    extern_anakin
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLML_PROJECT}
    GIT_REPOSITORY      "https://github.com/PaddlePaddle/Anakin"
-    GIT_TAG             "9424277cf9ae180a14aff09560d3cd60a49c76d2"
+    GIT_TAG             "3c8554f4978628183566ab7dd6c1e7e66493c7cd"
    PREFIX              ${ANAKIN_SOURCE_DIR}
    UPDATE_COMMAND      ""
-    CMAKE_ARGS          -DUSE_GPU_PLACE=YES
+    CMAKE_ARGS          ${CMAKE_ARGS_PREFIX}
+                        -DUSE_LOGGER=YES
                        -DUSE_X86_PLACE=YES
                        -DBUILD_WITH_UNIT_TEST=NO
                        -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
                        -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-                        -DCUDNN_ROOT=${CUDNN_ROOT}
-                        -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
                        -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
                        ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}

--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -3,6 +3,14 @@ INCLUDE(ExternalProject)
 SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
 SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
+if(NOT WITH_FAST_MATH)
+  # EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html
+  # enables some optimizations which might affect the accuracy of the result. 
+  # This currently enables the SSE vectorization of sin() and cos(), 
+  # and speedups sqrt() for single precision.
+  # Defined to 1 by default. Define it to 0 to disable.
+  add_definitions(-DEIGEN_FAST_MATH=0)
+endif()

 if(WITH_AMD_GPU)
    ExternalProject_Add(

--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -44,7 +44,7 @@ ExternalProject_Add(
    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
    #    checkout and clean other dirs under third_party
    # 4. remove .git, and package the directory.
-    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
+    URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
    URL_MD5  "1f268a2aff6759839dccd256adcc91cf"
    PREFIX          ${GRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""

--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -29,7 +29,7 @@ INCLUDE(ExternalProject)
 SET(MKLML_PROJECT       "extern_mklml")
 IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
  MESSAGE(STATUS "use pre defined download url")
-  SET(MKLML_VER "mklml_lnx_2018.0.3.20180406" CACHE STRING "" FORCE)
+  SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
  SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
 ENDIF()
 MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -27,7 +27,7 @@ IF(NOT ${CBLAS_FOUND})

    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
    SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
-    SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)

    SET(CBLAS_LIBRARIES
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
@@ -96,7 +96,7 @@ IF(NOT ${CBLAS_FOUND})
    ENDIF(NOT WIN32)
    SET(CBLAS_PROVIDER openblas)
    IF(WITH_C_API)
-        INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas)
+        INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
        # Because libopenblas.a is a symbolic link of another library, thus need to
        # install the whole directory.
        IF(ANDROID)
@@ -117,8 +117,8 @@ IF(NOT ${CBLAS_FOUND})
 ENDIF(NOT ${CBLAS_FOUND})

 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}")
-INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR})
+MESSAGE(STATUS "BLAS Include: ${CBLAS_INC_DIR}")
+INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})

 # FIXME(gangliao): generate cblas target to track all high performance
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)

--- a/cmake/external/xxhash.cmake
+++ b/cmake/external/xxhash.cmake
+INCLUDE(ExternalProject)
+
+set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
+set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
+set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
+
+IF(WITH_STATIC_LIB)
+  SET(BUILD_CMD make lib)
+ELSE()
+  IF(APPLE)
+    SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
+  ELSE(APPLE)
+    SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
+  ENDIF(APPLE)
+ENDIF()
+
+ExternalProject_Add(
+    extern_xxhash
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/Cyan4973/xxHash"
+    GIT_TAG         "v0.6.5"
+    PREFIX          ${XXHASH_SOURCE_DIR}
+    DOWNLOAD_NAME   "xxhash"
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    PATCH_COMMAND
+    BUILD_COMMAND     ${BUILD_CMD}
+    INSTALL_COMMAND   export PREFIX=${XXHASH_INSTALL_DIR}/ && make install
+    TEST_COMMAND      ""
+)
+
+set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
+INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
+
+add_library(xxhash STATIC IMPORTED GLOBAL)
+set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
+include_directories(${XXHASH_INCLUDE_DIR})
+add_dependencies(xxhash extern_xxhash)
+
+LIST(APPEND external_project_dependencies xxhash)
+
+IF(WITH_C_API)
+  INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash)
+  IF(ANDROID)
+    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI})
+  ELSE()
+    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib)
+  ENDIF()
+ENDIF()
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -27,7 +27,6 @@ endfunction()

 CheckCompilerCXX11Flag()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
@@ -117,6 +116,7 @@ if (NOT WIN32)
 set(COMMON_FLAGS
    -fPIC
    -fno-omit-frame-pointer
+    -Werror
    -Wall
    -Wextra
    -Wdelete-non-virtual-dtor
@@ -159,11 +159,20 @@ set(GPU_COMMON_FLAGS

 endif(NOT WIN32)

+else(NOT WIN32)
+set(COMMON_FLAGS
+    "/w") #disable all warnings.
+set(GPU_COMMON_FLAGS
+    "/w") #disable all warnings
+endif(NOT WIN32)
+
 if (APPLE)
    if(NOT CMAKE_CROSSCOMPILING)
        # On Mac OS X build fat binaries with x86_64 architectures by default.
        set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
    endif()
+    # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
+    set (COMMON_FLAGS -Wno-deprecated-register)
 endif(APPLE)

 if(LINUX)
@@ -189,6 +198,7 @@ foreach(flag ${GPU_COMMON_FLAGS})
 endforeach()

 if(WIN32)
+# windows build turn off warnings.
 safe_set_static_flag()
    foreach(flag_var
        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -113,7 +113,7 @@ endfunction(find_fluid_modules)
 # find all third_party modules is used for paddle static library
 # for reduce the dependency when building the inference libs.
 set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
-function(find_fluid_third_partys TARGET_NAME)
+function(find_fluid_thirdparties TARGET_NAME)
  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
  string(FIND "${__target_path}" "third_party" pos)
@@ -122,7 +122,7 @@ function(find_fluid_third_partys TARGET_NAME)
    set(fluid_third_partys ${fluid_third_partys} ${TARGET_NAME})
    set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY "${fluid_third_partys}")
  endif()
-endfunction(find_fluid_third_partys)
+endfunction(find_fluid_thirdparties)

 function(merge_static_libs TARGET_NAME)
  set(libs ${ARGN})
@@ -218,18 +218,13 @@ function(merge_static_libs TARGET_NAME)

    foreach(lib ${libs})
      # Get the file names of the libraries to be merged
-      #if(NOT $<TARGET_FILE:${lib}> MATCHES "lib.*\\.lib")
-      #  message("library" ${lib})
-      #  set(libfiles ${libfiles} lib$<TARGET_FILE:${lib}>)
-      #else()
      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-      #endif()
    endforeach()
-   
-    # windows cmd return error in clean env.
-    # COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
+    # msvc will put libarary in directory of "/Release/xxxlib" by default 
+    #       COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.lib ${libfiles}
+      COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}"
+      COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib${TARGET_NAME}.lib ${libfiles}
      )
  endif(WIN32)
 endfunction(merge_static_libs)
@@ -267,10 +262,17 @@ function(cc_library TARGET_NAME)
        add_dependencies(${TARGET_NAME} mklml)
        target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
      endif()
+      # remove link to python, see notes at:
+      # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
+      if("${cc_library_DEPS};" MATCHES "python;")
+        list(REMOVE_ITEM cc_library_DEPS python)
+        add_dependencies(${TARGET_NAME} python)
+        target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup")
+      endif()
      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
    endif()
-    
+
    # cpplint code style
    foreach(source_file ${cc_library_SRCS})
      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
@@ -317,11 +319,12 @@ function(cc_test TARGET_NAME)
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    if (${cc_test_SERIAL})
        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
-
+    endif()
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-    endif()
+    # No unit test should exceed 10 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
  endif()
 endfunction(cc_test)

@@ -389,11 +392,10 @@ function(nv_test TARGET_NAME)
    add_test(${TARGET_NAME} ${TARGET_NAME})
    if (nv_test_SERIAL)
        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
-
+    endif()
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-    endif()
  endif()
 endfunction(nv_test)

@@ -581,26 +583,26 @@ function(paddle_protobuf_generate_cpp SRCS HDRS)
  set(${HDRS})

  if (MOBILE_INFERENCE)
-      set(EXTRA_FLAG "lite:")  
+      set(EXTRA_FLAG "lite:")
  else()
-      set(EXTRA_FLAG "") 
+      set(EXTRA_FLAG "")
  endif()

  foreach(FIL ${ARGN})
    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
    get_filename_component(FIL_WE ${FIL} NAME_WE)
-    
+
    set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
    set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
    list(APPEND ${SRCS} "${_protobuf_protoc_src}")
    list(APPEND ${HDRS} "${_protobuf_protoc_hdr}")
-    
+
    add_custom_command(
      OUTPUT "${_protobuf_protoc_src}"
             "${_protobuf_protoc_hdr}"

      COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
-      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} 
+      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
      -I${CMAKE_CURRENT_SOURCE_DIR}
      --cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
      DEPENDS ${ABS_FIL} protoc
@@ -645,6 +647,8 @@ function(py_test TARGET_NAME)
             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    # No unit test should exceed 10 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
  endif()
 endfunction()

@@ -669,7 +673,7 @@ function(grpc_library TARGET_NAME)
  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)

  #FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but
-  # somehow it didn't. line 602 to 604 is to patching this. Leaving this here 
+  # somehow it didn't. line 602 to 604 is to patching this. Leaving this here
  # for now to enable dist CI.
  protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
  set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -18,7 +18,7 @@ function(copy TARGET)
    set(oneValueArgs "")
    set(multiValueArgs SRCS DSTS DEPS)
    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
+    set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE)

    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@@ -31,7 +31,6 @@ function(copy TARGET)
    foreach(index RANGE ${len})
        list(GET copy_lib_SRCS ${index} src)
        list(GET copy_lib_DSTS ${index} dst)
-        
        if (WIN32) 
        # windows cmd shell will not expand wildcard automatically.
        # below expand the files,libs and copy them by rules.
@@ -52,17 +51,10 @@ function(copy TARGET)
          COMMENT "copying ${src_file} -> ${dst}")
        endforeach()
        else(WIN32) # not windows
-	add_custom_command(TARGET ${TARGET} PRE_BUILD 
+          add_custom_command(TARGET ${TARGET} PRE_BUILD 
          COMMAND mkdir -p "${dst}"
          COMMAND cp -r "${src}" "${dst}"
          COMMENT "copying ${src} -> ${dst}")
-        #add_custom_command(TARGET ${TARGET} PRE_BUILD 
-        #  COMMAND ${CMAKE_COMMAND} -E make_directory  "${dst}")
-	#message("mkdir " ${TARGET})
-        #add_custom_command(TARGET ${TARGET} PRE_BUILD 
-        #  COMMAND ${CMAKE_COMMAND} -E make_directory  "${dst}"
-        #  COMMAND ${CMAKE_COMMAND} -E copy_directory "${src_files}" "${dst}"
-        #  COMMENT "copying ${src} -> ${dst}")
        endif(WIN32)
    endforeach()
 endfunction()
@@ -96,6 +88,13 @@ copy(boost_lib
  DEPS boost
 )

+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
+copy(xxhash_lib
+  SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+  DEPS xxhash
+)
+
 if(NOT PROTOBUF_FOUND)
    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
    copy(protobuf_lib
@@ -157,16 +156,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
 if (NOT WIN32)
-copy(framework_lib DEPS framework_py_proto 
-  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
-)
-else()
-copy(framework_lib
+set(framework_lib_deps framework_py_proto)
+endif(NOT WIN32)
+copy(framework_lib DEPS ${framework_lib_deps}
  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+       ${src_dir}/${module}/ir/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
 )
-endif(NOT WIN32)

 set(module "memory")
 copy(memory_lib
@@ -177,19 +173,20 @@ copy(memory_lib
 set(inference_deps paddle_fluid_shared paddle_fluid)

 set(module "inference/api")
-if (WITH_ANAKIN AND WITH_GPU)
+if (WITH_ANAKIN AND WITH_MKL)
    copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
        SRCS
        ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
        ${ANAKIN_INSTALL_DIR} # anakin release
-        DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
+        DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
     list(APPEND inference_deps anakin_inference_lib)
 endif()

 set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-       ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
+       ${src_dir}/${module}/api/paddle_inference_api.h
+       ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )

@@ -216,20 +213,41 @@ copy(cmake_cache
  SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
  DSTS ${FLUID_INSTALL_DIR})

-add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 
+# This command generates a complete fluid library for both train and inference
+add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
+
+# Following commands generate a inference-only fluid library
+# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
+copy(third_party DEPS fluid_lib_dist
+  SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt
+  DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
+)
+
+# only need libpaddle_fluid.so/a and paddle_inference_api.h for inference-only library
+copy(inference_api_lib DEPS fluid_lib_dist
+  SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+       ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_inference_api.h
+  DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
+)
+
+add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)

 # paddle fluid version
-execute_process(
-  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
-  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-  OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
-set(version_file ${FLUID_INSTALL_DIR}/version.txt)
-file(WRITE ${version_file}
-  "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
-  "WITH_MKL: ${WITH_MKL}\n"
-  "WITH_GPU: ${WITH_GPU}\n")
-if(WITH_GPU)
-  file(APPEND ${version_file}
-    "CUDA version: ${CUDA_VERSION}\n"
-    "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
-endif()
+function(version version_file)
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+    OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
+  file(WRITE ${version_file}
+    "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
+    "WITH_MKL: ${WITH_MKL}\n"
+    "WITH_MKLDNN: ${WITH_MKLDNN}\n"
+    "WITH_GPU: ${WITH_GPU}\n")
+  if(WITH_GPU)
+    file(APPEND ${version_file}
+      "CUDA version: ${CUDA_VERSION}\n"
+      "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
+  endif()
+endfunction()
+version(${FLUID_INSTALL_DIR}/version.txt)
+version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -16,7 +16,9 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
    DOC "Path to TensorRT library.")

 if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
+  if(WITH_DSO)
    set(TENSORRT_FOUND ON)
+  endif(WITH_DSO)
 else()
    set(TENSORRT_FOUND OFF)
 endif()

--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
 add_custom_target(paddle_apis ALL
-                  DEPENDS paddle_v2_apis paddle_fluid_apis)
+                  DEPENDS paddle_v2_apis)

 add_custom_target(paddle_docs ALL
                  DEPENDS paddle_v2_docs paddle_v2_docs_cn
-                  paddle_fluid_docs paddle_fluid_docs_cn
                  paddle_mobile_docs paddle_mobile_docs_cn)

 add_subdirectory(v2)
-add_subdirectory(fluid)
 add_subdirectory(mobile)
--- a/doc/README.md
+++ b/doc/README.md
+# For Readers and Developers
+
+Thanks for reading PaddlePaddle documentation. 
+
+Since **September 17th, 2018**, the **0.15.0 and develop** documentation source has been moved to [FluidDoc Repo](https://github.com/PaddlePaddle/FluidDoc) and updated there.
+
+Please turn to FluidDoc Repo for the latest documentation.
--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
-if(NOT DEFINED SPHINX_THEME)
-    set(SPHINX_THEME default)
-endif()
-
-if(NOT DEFINED SPHINX_THEME_DIR)
-    set(SPHINX_THEME_DIR)
-endif()
-
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
-
-# HTML output director
-set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
-
-set(IMPORT_PADDLE_STRING "")
-set(IMPORT_PADDLEV2_STRING "")
-
-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
-    "${BINARY_BUILD_DIR_EN}/conf.py"
-    @ONLY)
-
-sphinx_add_target(paddle_fluid_docs
-                  html
-                  ${BINARY_BUILD_DIR_EN}
-                  ${SPHINX_CACHE_DIR_EN}
-                  ${CMAKE_CURRENT_SOURCE_DIR}
-                  ${SPHINX_HTML_DIR_EN})
-
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
-
-# HTML output directory
-set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
-
-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
-    "${BINARY_BUILD_DIR_CN}/conf.py"
-    @ONLY)
-
-sphinx_add_target(paddle_fluid_docs_cn
-                  html
-                  ${BINARY_BUILD_DIR_CN}
-                  ${SPHINX_CACHE_DIR_CN}
-                  ${CMAKE_CURRENT_SOURCE_DIR}
-                  ${SPHINX_HTML_DIR_CN})
-
-add_subdirectory(api)
--- a/doc/fluid/api/CMakeLists.txt
+++ b/doc/fluid/api/CMakeLists.txt
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
-
-# HTML output director
-set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
-
-set(IMPORT_PADDLE_STRING "import paddle")
-set(IMPORT_PADDLEV2_STRING "import paddle.v2")
-
-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
-    "${BINARY_BUILD_DIR_EN}/conf.py"
-    @ONLY)
-
-sphinx_add_target(paddle_fluid_apis
-                  html
-                  ${BINARY_BUILD_DIR_EN}
-                  ${SPHINX_CACHE_DIR_EN}
-                  ${CMAKE_CURRENT_SOURCE_DIR}
-                  ${SPHINX_HTML_DIR_EN})
-
-add_dependencies(paddle_fluid_apis  gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
--- a/doc/fluid/api/average.rst
+++ b/doc/fluid/api/average.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-=============
-fluid.average
-=============
-
-.. _api_fluid_average_WeightedAverage:
-
-WeightedAverage
---------------
-
-..  autoclass:: paddle.fluid.average.WeightedAverage
-    :members:
-    :noindex:
-
--- a/doc/fluid/api/backward.rst
+++ b/doc/fluid/api/backward.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-==============
-fluid.backward
-==============
-
-.. _api_fluid_backward_append_backward:
-
-append_backward
---------------
-
-..  autofunction:: paddle.fluid.backward.append_backward
-    :noindex:
-
-.. _api_fluid_backward_calc_gradient:
-
-calc_gradient
-------------
-
-..  autofunction:: paddle.fluid.backward.calc_gradient
-    :noindex:
-
--- a/doc/fluid/api/clip.rst
+++ b/doc/fluid/api/clip.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-==========
-fluid.clip
-==========
-
-.. _api_fluid_clip_ErrorClipByValue:
-
-ErrorClipByValue
----------------
-
-..  autoclass:: paddle.fluid.clip.ErrorClipByValue
-    :members:
-    :noindex:
-
-.. _api_fluid_clip_GradientClipByValue:
-
-GradientClipByValue
-------------------
-
-..  autoclass:: paddle.fluid.clip.GradientClipByValue
-    :members:
-    :noindex:
-
-.. _api_fluid_clip_GradientClipByNorm:
-
-GradientClipByNorm
------------------
-
-..  autoclass:: paddle.fluid.clip.GradientClipByNorm
-    :members:
-    :noindex:
-
-.. _api_fluid_clip_GradientClipByGlobalNorm:
-
-GradientClipByGlobalNorm
------------------------
-
-..  autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm
-    :members:
-    :noindex:
-
--- a/doc/fluid/api/data/data_reader.rst
+++ b/doc/fluid/api/data/data_reader.rst
-=====================
-Data Reader Interface
-=====================
-
-
-DataTypes
-=========
-
-..  autofunction:: paddle.v2.data_type.dense_array
-    :noindex:
-
-..  autofunction:: paddle.v2.data_type.integer_value
-    :noindex:
-
-..  autofunction:: paddle.v2.data_type.integer_value_sequence
-    :noindex:
-
-..  autofunction:: paddle.v2.data_type.integer_value_sub_sequence
-    :noindex:
-
-..  autofunction:: paddle.v2.data_type.sparse_binary_vector
-    :noindex:
-
-..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
-    :noindex:
-
-..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
-    :noindex:
-
-..  autofunction:: paddle.v2.data_type.sparse_float_vector
-    :noindex:
-
-..  autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
-    :noindex:
-
-..  autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
-    :noindex:
-
-..  autofunction:: paddle.v2.data_type.sparse_non_value_slot
-    :noindex:
-
-..  autofunction:: paddle.v2.data_type.sparse_value_slot
-    :noindex:
-
-..  autoclass:: paddle.v2.data_type.InputType
-    :members:
-    :noindex:
-
-DataFeeder
-==========
-
-..  automodule:: paddle.v2.data_feeder
-    :members:
-    :noindex:
-
-Reader
-======
-
-..  automodule:: paddle.reader
-    :members:
-    :noindex:
-
-..  automodule:: paddle.reader.creator
-    :members:
-    :noindex:
-
-minibatch
-=========
-
-..  automodule:: paddle.v2.minibatch
-    :members:
-    :noindex:
--- a/doc/fluid/api/data/dataset.rst
+++ b/doc/fluid/api/data/dataset.rst
-Dataset
-=======
-
-..  automodule:: paddle.dataset
-    :members:
-    :noindex:
-
-mnist
-+++++
-
-..  automodule:: paddle.dataset.mnist
-    :members:
-    :noindex:
-
-cifar
-+++++
-
-..  automodule:: paddle.dataset.cifar
-    :members:
-    :noindex:
-
-conll05
-+++++++
-
-..  automodule:: paddle.dataset.conll05
-    :members: get_dict,get_embedding,test
-    :noindex:
-
-imdb
-++++
-
-..  automodule:: paddle.dataset.imdb
-    :members:
-    :noindex:
-
-imikolov
-++++++++
-
-..  automodule:: paddle.dataset.imikolov
-    :members:
-    :noindex:
-
-movielens
-+++++++++
-
-..  automodule:: paddle.dataset.movielens
-    :members:
-    :noindex:
-
-..  autoclass:: paddle.dataset.movielens.MovieInfo
-    :noindex:
-
-..  autoclass:: paddle.dataset.movielens.UserInfo
-    :noindex:
-
-sentiment
-+++++++++
-
-..  automodule:: paddle.dataset.sentiment
-    :members:
-    :noindex:
-
-uci_housing
-+++++++++++
-
-..  automodule:: paddle.dataset.uci_housing
-    :members:
-    :noindex:
-
-wmt14
-+++++
-
-..  automodule:: paddle.dataset.wmt14
-    :members:
-    :noindex:
-
-wmt16
-+++++
-
-..  automodule:: paddle.dataset.wmt16
-    :members:
-    :noindex:
--- a/doc/fluid/api/data/image.rst
+++ b/doc/fluid/api/data/image.rst
-Image Interface
-===============
-
-..  automodule:: paddle.v2.image
-    :members:
--- a/doc/fluid/api/data_feeder.rst
+++ b/doc/fluid/api/data_feeder.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-=================
-fluid.data_feeder
-=================
-
-.. _api_fluid_data_feeder_DataFeeder:
-
-DataFeeder
----------
-
-..  autoclass:: paddle.fluid.data_feeder.DataFeeder
-    :members:
-    :noindex:
-
--- a/doc/fluid/api/executor.rst
+++ b/doc/fluid/api/executor.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-==============
-fluid.executor
-==============
-
-.. _api_fluid_executor_Executor:
-
-Executor
--------
-
-..  autoclass:: paddle.fluid.executor.Executor
-    :members:
-    :noindex:
-
-.. _api_fluid_executor_global_scope:
-
-global_scope
------------
-
-..  autofunction:: paddle.fluid.executor.global_scope
-    :noindex:
-
-.. _api_fluid_executor_scope_guard:
-
-scope_guard
-----------
-
-..  autofunction:: paddle.fluid.executor.scope_guard
-    :noindex:
-
-.. _api_fluid_executor__switch_scope:
-
-_switch_scope
-------------
-
-..  autofunction:: paddle.fluid.executor._switch_scope
-    :noindex:
-
--- a/doc/fluid/api/fluid.rst
+++ b/doc/fluid/api/fluid.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-=====
-fluid
-=====
-
-.. _api_fluid_Block:
-
-Block
-----
-
-..  autoclass:: paddle.fluid.Block
-    :members:
-    :noindex:
-
-.. _api_fluid_Variable:
-
-Variable
--------
-
-..  autoclass:: paddle.fluid.Variable
-    :members:
-    :noindex:
-
-.. _api_fluid_Program:
-
-Program
-------
-
-..  autoclass:: paddle.fluid.Program
-    :members:
-    :noindex:
-
-.. _api_fluid_Operator:
-
-Operator
--------
-
-..  autoclass:: paddle.fluid.Operator
-    :members:
-    :noindex:
-
-.. _api_fluid_default_startup_program:
-
-default_startup_program
-----------------------
-
-..  autofunction:: paddle.fluid.default_startup_program
-    :noindex:
-
-.. _api_fluid_default_main_program:
-
-default_main_program
--------------------
-
-..  autofunction:: paddle.fluid.default_main_program
-    :noindex:
-
-.. _api_fluid_program_guard:
-
-program_guard
-------------
-
-..  autofunction:: paddle.fluid.program_guard
-    :noindex:
-
-.. _api_fluid_get_var:
-
-get_var
-------
-
-..  autofunction:: paddle.fluid.get_var
-    :noindex:
-
-.. _api_fluid_Executor:
-
-Executor
--------
-
-..  autoclass:: paddle.fluid.Executor
-    :members:
-    :noindex:
-
-.. _api_fluid_global_scope:
-
-global_scope
------------
-
-..  autofunction:: paddle.fluid.global_scope
-    :noindex:
-
-.. _api_fluid_scope_guard:
-
-scope_guard
-----------
-
-..  autofunction:: paddle.fluid.scope_guard
-    :noindex:
-
-.. _api_fluid__switch_scope:
-
-_switch_scope
-------------
-
-..  autofunction:: paddle.fluid._switch_scope
-    :noindex:
-
-
-.. _api_fluid_make_channel:
-
-make_channel
------------
-
-..  autofunction:: paddle.fluid.make_channel
-    :noindex:
-
-.. _api_fluid_channel_send:
-
-channel_send
------------
-
-..  autofunction:: paddle.fluid.channel_send
-    :noindex:
-
-.. _api_fluid_channel_recv:
-
-channel_recv
------------
-
-..  autofunction:: paddle.fluid.channel_recv
-    :noindex:
-
-.. _api_fluid_channel_close:
-
-channel_close
-------------
-
-..  autofunction:: paddle.fluid.channel_close
-    :noindex:
-
-.. _api_fluid_Select:
-
-Select
------
-
-..  autoclass:: paddle.fluid.Select
-    :members:
-    :noindex:
-
-.. _api_fluid_Trainer:
-
-Trainer
-------
-
-..  autoclass:: paddle.fluid.Trainer
-    :members:
-    :noindex:
-
-.. _api_fluid_BeginEpochEvent:
-
-BeginEpochEvent
---------------
-
-..  autoclass:: paddle.fluid.BeginEpochEvent
-    :members:
-    :noindex:
-
-.. _api_fluid_EndEpochEvent:
-
-EndEpochEvent
-------------
-
-..  autoclass:: paddle.fluid.EndEpochEvent
-    :members:
-    :noindex:
-
-.. _api_fluid_BeginStepEvent:
-
-BeginStepEvent
--------------
-
-..  autoclass:: paddle.fluid.BeginStepEvent
-    :members:
-    :noindex:
-
-.. _api_fluid_EndStepEvent:
-
-EndStepEvent
------------
-
-..  autoclass:: paddle.fluid.EndStepEvent
-    :members:
-    :noindex:
-
-.. _api_fluid_CheckpointConfig:
-
-CheckpointConfig
----------------
-
-..  autoclass:: paddle.fluid.CheckpointConfig
-    :members:
-    :noindex:
-
-.. _api_fluid_Inferencer:
-
-Inferencer
----------
-
-..  autoclass:: paddle.fluid.Inferencer
-    :members:
-    :noindex:
-
-.. _api_fluid_DistributeTranspiler:
-
-DistributeTranspiler
--------------------
-
-..  autoclass:: paddle.fluid.DistributeTranspiler
-    :members:
-    :noindex:
-
-.. _api_fluid_memory_optimize:
-
-memory_optimize
---------------
-
-..  autofunction:: paddle.fluid.memory_optimize
-    :noindex:
-
-.. _api_fluid_release_memory:
-
-release_memory
--------------
-
-..  autofunction:: paddle.fluid.release_memory
-    :noindex:
-
-.. _api_fluid_ParallelExecutor:
-
-ParallelExecutor
----------------
-
-..  autoclass:: paddle.fluid.ParallelExecutor
-    :members:
-    :noindex:
-
-.. _api_fluid_ExecutionStrategy:
-
-ExecutionStrategy
-----------------
-
-..  autoclass:: paddle.fluid.ExecutionStrategy
-    :members:
-    :noindex:
-
-.. _api_fluid_BuildStrategy:
-
-BuildStrategy
-------------
-
-..  autoclass:: paddle.fluid.BuildStrategy
-    :members:
-    :noindex:
-
-.. _api_fluid_create_lod_tensor:
-
-create_lod_tensor
-----------------
-
-..  autofunction:: paddle.fluid.create_lod_tensor
-    :noindex:
-
-.. _api_fluid_create_random_int_lodtensor:
-
-create_random_int_lodtensor
---------------------------
-
-..  autofunction:: paddle.fluid.create_random_int_lodtensor
-    :noindex:
-
-.. _api_fluid_LoDTensor:
-
-LoDTensor
---------
-
-..  autoclass:: paddle.fluid.LoDTensor
-    :members:
-    :noindex:
-
-.. _api_fluid_CPUPlace:
-
-CPUPlace
--------
-
-..  autoclass:: paddle.fluid.CPUPlace
-    :members:
-    :noindex:
-
-.. _api_fluid_CUDAPlace:
-
-CUDAPlace
---------
-
-..  autoclass:: paddle.fluid.CUDAPlace
-    :members:
-    :noindex:
-
-.. _api_fluid_CUDAPinnedPlace:
-
-CUDAPinnedPlace
---------------
-
-..  autoclass:: paddle.fluid.CUDAPinnedPlace
-    :members:
-    :noindex:
-
-.. _api_fluid_Tensor:
-
-Tensor
------
-
-..  autoclass:: paddle.fluid.Tensor
-    :members:
-    :noindex:
-
-.. _api_fluid_ParamAttr:
-
-ParamAttr
---------
-
-..  autoclass:: paddle.fluid.ParamAttr
-    :members:
-    :noindex:
-
-.. _api_fluid_WeightNormParamAttr:
-
-WeightNormParamAttr
-------------------
-
-..  autoclass:: paddle.fluid.WeightNormParamAttr
-    :members:
-    :noindex:
-
-.. _api_fluid_DataFeeder:
-
-DataFeeder
----------
-
-..  autoclass:: paddle.fluid.DataFeeder
-    :members:
-    :noindex:
-
-.. _api_fluid_Scope:
-
-Scope
-----
-
-..  autoclass:: paddle.fluid.Scope
-    :members:
-    :noindex:
-
--- a/doc/fluid/api/gen_doc.py
+++ b/doc/fluid/api/gen_doc.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import argparse
-import sys
-import types
-
-import paddle.fluid as fluid
-
-
-def parse_arg():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--submodules', nargs="*")
-    parser.add_argument(
-        'module', type=str, help='Generate the documentation of which module')
-    return parser.parse_args()
-
-
-class DocGenerator(object):
-    def __init__(self, module_name=None, stream=sys.stdout):
-        if module_name == "":
-            module_name = None
-        self.stream = stream
-        if module_name is None:
-            self.module_name = "fluid"
-        else:
-            self.module_name = "fluid." + module_name
-        if module_name is None:
-            self.module = fluid
-        else:
-            if not hasattr(fluid, module_name):
-                raise ValueError("Cannot find fluid.{0}".format(module_name))
-            else:
-                self.module = getattr(fluid, module_name)
-        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-''')
-
-        self._print_header_(self.module_name, dot='=', is_title=True)
-
-    def print_submodule(self, submodule_name):
-        submodule = getattr(self.module, submodule_name)
-        if submodule is None:
-            raise ValueError("Cannot find submodule {0}".format(submodule_name))
-        self.print_section(submodule_name)
-
-        for item in submodule.__all__:
-            self.print_item(item)
-
-    def print_current_module(self):
-        for item in self.module.__all__:
-            self.print_item(item)
-
-    def print_section(self, name):
-        self._print_header_(name, dot='=', is_title=False)
-
-    def print_item(self, name):
-        item = getattr(self.module, name, None)
-        if item is None:
-            return
-        if isinstance(item, types.TypeType):
-            self.print_class(name)
-        elif isinstance(item, types.FunctionType):
-            self.print_method(name)
-        else:
-            pass
-
-    def print_class(self, name):
-        self._print_ref_(name)
-        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autoclass:: paddle.{0}.{1}
-    :members:
-    :noindex:
-
-'''.format(self.module_name, name))
-
-    def print_method(self, name):
-        self._print_ref_(name)
-        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autofunction:: paddle.{0}.{1}
-    :noindex:
-
-'''.format(self.module_name, name))
-
-    def _print_header_(self, name, dot, is_title):
-        dot_line = dot * len(name)
-        if is_title:
-            self.stream.write(dot_line)
-            self.stream.write('\n')
-        self.stream.write(name)
-        self.stream.write('\n')
-        self.stream.write(dot_line)
-        self.stream.write('\n')
-        self.stream.write('\n')
-
-    def _print_ref_(self, name):
-        self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join(
-            self.module_name.split(".")), name))
-
-
-def main():
-    args = parse_arg()
-    gen = DocGenerator(args.module)
-    if args.submodules is None:
-        gen.print_current_module()
-    else:
-        for submodule_name in args.submodules:
-            gen.print_submodule(submodule_name)
-
-
-if __name__ == '__main__':
-    main()
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
-#!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op tensor > layers.rst
-
-for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average profiler
-do
-  python gen_doc.py ${module} > ${module}.rst
-done
-
-python gen_doc.py "" > fluid.rst
--- a/doc/fluid/api/index_en.rst
+++ b/doc/fluid/api/index_en.rst
-=============
-API Reference
-=============
-
-..  toctree::
-    :maxdepth: 1
-
-    fluid.rst
-    layers.rst
-    data_feeder.rst
-    executor.rst
-    initializer.rst
-    metrics.rst
-    nets.rst
-    clip.rst
-    optimizer.rst
-    param_attr.rst
-    profiler.rst
-    regularizer.rst
-    io.rst
-    data.rst
-    transpiler.rst
-    recordio_writer.rst
-    backward.rst
-    average.rst
-    profiler.rst
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-=================
-fluid.initializer
-=================
-
-.. _api_fluid_initializer_Constant:
-
-Constant
--------
-
-..  autoclass:: paddle.fluid.initializer.Constant
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_Uniform:
-
-Uniform
-------
-
-..  autoclass:: paddle.fluid.initializer.Uniform
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_Normal:
-
-Normal
------
-
-..  autoclass:: paddle.fluid.initializer.Normal
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_Xavier:
-
-Xavier
------
-
-..  autoclass:: paddle.fluid.initializer.Xavier
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_Bilinear:
-
-Bilinear
--------
-
-..  autoclass:: paddle.fluid.initializer.Bilinear
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_MSRA:
-
-MSRA
----
-
-..  autoclass:: paddle.fluid.initializer.MSRA
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_force_init_on_cpu:
-
-force_init_on_cpu
-----------------
-
-..  autofunction:: paddle.fluid.initializer.force_init_on_cpu
-    :noindex:
-
-.. _api_fluid_initializer_init_on_cpu:
-
-init_on_cpu
-----------
-
-..  autofunction:: paddle.fluid.initializer.init_on_cpu
-    :noindex:
-
-.. _api_fluid_initializer_ConstantInitializer:
-
-ConstantInitializer
-------------------
-
-..  autoclass:: paddle.fluid.initializer.ConstantInitializer
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_UniformInitializer:
-
-UniformInitializer
------------------
-
-..  autoclass:: paddle.fluid.initializer.UniformInitializer
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_NormalInitializer:
-
-NormalInitializer
-----------------
-
-..  autoclass:: paddle.fluid.initializer.NormalInitializer
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_XavierInitializer:
-
-XavierInitializer
-----------------
-
-..  autoclass:: paddle.fluid.initializer.XavierInitializer
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_BilinearInitializer:
-
-BilinearInitializer
-------------------
-
-..  autoclass:: paddle.fluid.initializer.BilinearInitializer
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_MSRAInitializer:
-
-MSRAInitializer
---------------
-
-..  autoclass:: paddle.fluid.initializer.MSRAInitializer
-    :members:
-    :noindex:
-
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-========
-fluid.io
-========
-
-.. _api_fluid_io_save_vars:
-
-save_vars
---------
-
-..  autofunction:: paddle.fluid.io.save_vars
-    :noindex:
-
-.. _api_fluid_io_save_params:
-
-save_params
-----------
-
-..  autofunction:: paddle.fluid.io.save_params
-    :noindex:
-
-.. _api_fluid_io_save_persistables:
-
-save_persistables
-----------------
-
-..  autofunction:: paddle.fluid.io.save_persistables
-    :noindex:
-
-.. _api_fluid_io_load_vars:
-
-load_vars
---------
-
-..  autofunction:: paddle.fluid.io.load_vars
-    :noindex:
-
-.. _api_fluid_io_load_params:
-
-load_params
-----------
-
-..  autofunction:: paddle.fluid.io.load_params
-    :noindex:
-
-.. _api_fluid_io_load_persistables:
-
-load_persistables
-----------------
-
-..  autofunction:: paddle.fluid.io.load_persistables
-    :noindex:
-
-.. _api_fluid_io_save_inference_model:
-
-save_inference_model
--------------------
-
-..  autofunction:: paddle.fluid.io.save_inference_model
-    :noindex:
-
-.. _api_fluid_io_load_inference_model:
-
-load_inference_model
--------------------
-
-..  autofunction:: paddle.fluid.io.load_inference_model
-    :noindex:
-
-.. _api_fluid_io_get_inference_program:
-
-get_inference_program
---------------------
-
-..  autofunction:: paddle.fluid.io.get_inference_program
-    :noindex:
-
-.. _api_fluid_io_save_checkpoint:
-
-save_checkpoint
---------------
-
-..  autofunction:: paddle.fluid.io.save_checkpoint
-    :noindex:
-
-.. _api_fluid_io_load_checkpoint:
-
-load_checkpoint
---------------
-
-..  autofunction:: paddle.fluid.io.load_checkpoint
-    :noindex:
-
-.. _api_fluid_io_clean_checkpoint:
-
-clean_checkpoint
----------------
-
-..  autofunction:: paddle.fluid.io.clean_checkpoint
-    :noindex:
-
-.. _api_fluid_io_load_persist_vars_without_grad:
-
-load_persist_vars_without_grad
------------------------------
-
-..  autofunction:: paddle.fluid.io.load_persist_vars_without_grad
-    :noindex:
-
-.. _api_fluid_io_save_persist_vars_without_grad:
-
-save_persist_vars_without_grad
------------------------------
-
-..  autofunction:: paddle.fluid.io.save_persist_vars_without_grad
-    :noindex:
-
-.. _api_fluid_io_get_latest_checkpoint_serial:
-
-get_latest_checkpoint_serial
----------------------------
-
-..  autofunction:: paddle.fluid.io.get_latest_checkpoint_serial
-    :noindex:
-
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
--- a/doc/fluid/api/metrics.rst
+++ b/doc/fluid/api/metrics.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-=============
-fluid.metrics
-=============
-
-.. _api_fluid_metrics_MetricBase:
-
-MetricBase
----------
-
-..  autoclass:: paddle.fluid.metrics.MetricBase
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_CompositeMetric:
-
-CompositeMetric
---------------
-
-..  autoclass:: paddle.fluid.metrics.CompositeMetric
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_Precision:
-
-Precision
---------
-
-..  autoclass:: paddle.fluid.metrics.Precision
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_Recall:
-
-Recall
------
-
-..  autoclass:: paddle.fluid.metrics.Recall
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_Accuracy:
-
-Accuracy
--------
-
-..  autoclass:: paddle.fluid.metrics.Accuracy
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_ChunkEvaluator:
-
-ChunkEvaluator
--------------
-
-..  autoclass:: paddle.fluid.metrics.ChunkEvaluator
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_EditDistance:
-
-EditDistance
------------
-
-..  autoclass:: paddle.fluid.metrics.EditDistance
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_DetectionMAP:
-
-DetectionMAP
------------
-
-..  autoclass:: paddle.fluid.metrics.DetectionMAP
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_Auc:
-
-Auc
---
-
-..  autoclass:: paddle.fluid.metrics.Auc
-    :members:
-    :noindex:
-
--- a/doc/fluid/api/nets.rst
+++ b/doc/fluid/api/nets.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-==========
-fluid.nets
-==========
-
-.. _api_fluid_nets_simple_img_conv_pool:
-
-simple_img_conv_pool
--------------------
-
-..  autofunction:: paddle.fluid.nets.simple_img_conv_pool
-    :noindex:
-
-.. _api_fluid_nets_sequence_conv_pool:
-
-sequence_conv_pool
------------------
-
-..  autofunction:: paddle.fluid.nets.sequence_conv_pool
-    :noindex:
-
-.. _api_fluid_nets_glu:
-
-glu
---
-
-..  autofunction:: paddle.fluid.nets.glu
-    :noindex:
-
-.. _api_fluid_nets_scaled_dot_product_attention:
-
-scaled_dot_product_attention
----------------------------
-
-..  autofunction:: paddle.fluid.nets.scaled_dot_product_attention
-    :noindex:
-
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-===============
-fluid.optimizer
-===============
-
-.. _api_fluid_optimizer_SGD:
-
-SGD
---
-
-..  autoclass:: paddle.fluid.optimizer.SGD
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Momentum:
-
-Momentum
--------
-
-..  autoclass:: paddle.fluid.optimizer.Momentum
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Adagrad:
-
-Adagrad
-------
-
-..  autoclass:: paddle.fluid.optimizer.Adagrad
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Adam:
-
-Adam
----
-
-..  autoclass:: paddle.fluid.optimizer.Adam
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Adamax:
-
-Adamax
------
-
-..  autoclass:: paddle.fluid.optimizer.Adamax
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_DecayedAdagrad:
-
-DecayedAdagrad
--------------
-
-..  autoclass:: paddle.fluid.optimizer.DecayedAdagrad
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Ftrl:
-
-Ftrl
----
-
-..  autoclass:: paddle.fluid.optimizer.Ftrl
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_SGDOptimizer:
-
-SGDOptimizer
------------
-
-..  autoclass:: paddle.fluid.optimizer.SGDOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_MomentumOptimizer:
-
-MomentumOptimizer
-----------------
-
-..  autoclass:: paddle.fluid.optimizer.MomentumOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_AdagradOptimizer:
-
-AdagradOptimizer
----------------
-
-..  autoclass:: paddle.fluid.optimizer.AdagradOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_AdamOptimizer:
-
-AdamOptimizer
-------------
-
-..  autoclass:: paddle.fluid.optimizer.AdamOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_AdamaxOptimizer:
-
-AdamaxOptimizer
---------------
-
-..  autoclass:: paddle.fluid.optimizer.AdamaxOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_DecayedAdagradOptimizer:
-
-DecayedAdagradOptimizer
-----------------------
-
-..  autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_RMSPropOptimizer:
-
-RMSPropOptimizer
----------------
-
-..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_FtrlOptimizer:
-
-FtrlOptimizer
-------------
-
-..  autoclass:: paddle.fluid.optimizer.FtrlOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Adadelta:
-
-Adadelta
--------
-
-..  autoclass:: paddle.fluid.optimizer.Adadelta
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_ModelAverage:
-
-ModelAverage
------------
-
-..  autoclass:: paddle.fluid.optimizer.ModelAverage
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Optimizer:
-
-Optimizer
---------
-
-..  autoclass:: paddle.fluid.optimizer.Optimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_RMSPropOptimizer:
-
-RMSPropOptimizer
----------------
-
-..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
-    :members:
-    :noindex:
-
--- a/doc/fluid/api/param_attr.rst
+++ b/doc/fluid/api/param_attr.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-================
-fluid.param_attr
-================
-
-.. _api_fluid_param_attr_ParamAttr:
-
-ParamAttr
---------
-
-..  autoclass:: paddle.fluid.param_attr.ParamAttr
-    :members:
-    :noindex:
-
-.. _api_fluid_param_attr_WeightNormParamAttr:
-
-WeightNormParamAttr
-------------------
-
-..  autoclass:: paddle.fluid.param_attr.WeightNormParamAttr
-    :members:
-    :noindex:
-
--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-==============
-fluid.profiler
-==============
-
-.. _api_fluid_profiler_cuda_profiler:
-
-cuda_profiler
-------------
-
-..  autofunction:: paddle.fluid.profiler.cuda_profiler
-    :noindex:
-
-.. _api_fluid_profiler_reset_profiler:
-
-reset_profiler
--------------
-
-..  autofunction:: paddle.fluid.profiler.reset_profiler
-    :noindex:
-
-.. _api_fluid_profiler_profiler:
-
-profiler
--------
-
-..  autofunction:: paddle.fluid.profiler.profiler
-    :noindex:
-
-.. _api_fluid_profiler_start_profiler:
-
-start_profiler
--------------
-
-..  autofunction:: paddle.fluid.profiler.start_profiler
-    :noindex:
-
-.. _api_fluid_profiler_stop_profiler:
-
-stop_profiler
-------------
-
-..  autofunction:: paddle.fluid.profiler.stop_profiler
-    :noindex:
-
--- a/doc/fluid/api/recordio_writer.rst
+++ b/doc/fluid/api/recordio_writer.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-=====================
-fluid.recordio_writer
-=====================
-
-.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
-
-convert_reader_to_recordio_file
-------------------------------
-
-..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
-    :noindex:
-
-.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
-
-convert_reader_to_recordio_files
--------------------------------
-
-..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
-    :noindex:
-
--- a/doc/fluid/api/regularizer.rst
+++ b/doc/fluid/api/regularizer.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-=================
-fluid.regularizer
-=================
-
-.. _api_fluid_regularizer_append_regularization_ops:
-
-append_regularization_ops
-------------------------
-
-..  autofunction:: paddle.fluid.regularizer.append_regularization_ops
-    :noindex:
-
-.. _api_fluid_regularizer_L1Decay:
-
-L1Decay
-------
-
-..  autoclass:: paddle.fluid.regularizer.L1Decay
-    :members:
-    :noindex:
-
-.. _api_fluid_regularizer_L2Decay:
-
-L2Decay
-------
-
-..  autoclass:: paddle.fluid.regularizer.L2Decay
-    :members:
-    :noindex:
-
-.. _api_fluid_regularizer_L1DecayRegularizer:
-
-L1DecayRegularizer
------------------
-
-..  autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
-    :members:
-    :noindex:
-
-.. _api_fluid_regularizer_L2DecayRegularizer:
-
-L2DecayRegularizer
------------------
-
-..  autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
-    :members:
-    :noindex:
-
--- a/doc/fluid/api/transpiler.rst
+++ b/doc/fluid/api/transpiler.rst
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-================
-fluid.transpiler
-================
-
-.. _api_fluid_transpiler_DistributeTranspiler:
-
-DistributeTranspiler
--------------------
-
-..  autoclass:: paddle.fluid.transpiler.DistributeTranspiler
-    :members:
-    :noindex:
-
-.. _api_fluid_transpiler_InferenceTranspiler:
-
-InferenceTranspiler
-------------------
-
-..  autoclass:: paddle.fluid.transpiler.InferenceTranspiler
-    :members:
-    :noindex:
-
-.. _api_fluid_transpiler_memory_optimize:
-
-memory_optimize
---------------
-
-..  autofunction:: paddle.fluid.transpiler.memory_optimize
-    :noindex:
-
-.. _api_fluid_transpiler_release_memory:
-
-release_memory
--------------
-
-..  autofunction:: paddle.fluid.transpiler.release_memory
-    :noindex:
-
-.. _api_fluid_transpiler_HashName:
-
-HashName
--------
-
-..  autoclass:: paddle.fluid.transpiler.HashName
-    :members:
-    :noindex:
-
-.. _api_fluid_transpiler_RoundRobin:
-
-RoundRobin
----------
-
-..  autoclass:: paddle.fluid.transpiler.RoundRobin
-    :members:
-    :noindex:
-
--- a/doc/fluid/build_and_install/build_from_source_cn.rst
+++ b/doc/fluid/build_and_install/build_from_source_cn.rst
-../../v2/build_and_install/build_from_source_cn.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/build_from_source_en.rst
+++ b/doc/fluid/build_and_install/build_from_source_en.rst
-../../v2/build_and_install/build_from_source_en.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/docker_install_cn.rst
+++ b/doc/fluid/build_and_install/docker_install_cn.rst
-../../v2/build_and_install/docker_install_cn.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/docker_install_en.rst
+++ b/doc/fluid/build_and_install/docker_install_en.rst
-../../v2/build_and_install/docker_install_en.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/index_cn.rst
+++ b/doc/fluid/build_and_install/index_cn.rst
-../../v2/build_and_install/index_cn.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/index_en.rst
+++ b/doc/fluid/build_and_install/index_en.rst
-../../v2/build_and_install/index_en.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/paddleci.png
+++ b/doc/fluid/build_and_install/paddleci.png
-../../v2/build_and_install/paddleci.png
\ No newline at end of file
--- a/doc/fluid/build_and_install/pip_install_cn.rst
+++ b/doc/fluid/build_and_install/pip_install_cn.rst
-../../v2/build_and_install/pip_install_cn.rst
\ No newline at end of file
--- a/doc/fluid/build_and_install/pip_install_en.rst
+++ b/doc/fluid/build_and_install/pip_install_en.rst
-../../v2/build_and_install/pip_install_en.rst
\ No newline at end of file
--- a/doc/fluid/design/algorithm/images/asgd.gif
+++ b/doc/fluid/design/algorithm/images/asgd.gif
--- a/doc/fluid/design/algorithm/images/theta_star.gif
+++ b/doc/fluid/design/algorithm/images/theta_star.gif
--- a/doc/fluid/design/algorithm/index_cn.rst
+++ b/doc/fluid/design/algorithm/index_cn.rst
-梯度更新算法
------------
-
-.. toctree::
-  :maxdepth: 1
-
-  parameter_average.md
--- a/doc/fluid/design/algorithm/index_en.rst
+++ b/doc/fluid/design/algorithm/index_en.rst
-Gradient Update Algorithm
--------------------------------------
-
-.. toctree::
-  :maxdepth: 1
-
-  parameter_average.md
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
-# Averaging Parameter in PaddlePaddle
-
-## Why Averaging
-In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible.
-
-Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
-
-Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/theta_star.gif"/><br/> . The averaging is done as follows:
-
-<p align="center">
-<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/asgd.gif"><br />
-</p>
-
-We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
-
-### How to perform Parameter Averaging in PaddlePaddle
-
-Parameter Averaging in PaddlePaddle works in the following way during training :
-1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer
-2. The optimizer itself is responsible for updating the parameters.
-3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
-    1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches.
-    2. However, saving all N instances of the parameters in memory is not feasible.
-    3. Therefore, an approximation algorithm is used.
-
-Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.
-
-During the testing/saving the model phase, we perform the following steps:
-1. Perform the delayed operations.
-2. Save current values of the parameters to a temporary variable.
-3. Replace the values of the parameters with the averaged values.
-4. Perform testing and/or save the parameters.
-5. Restore the values of the parameters once done.
-
-### How to implement Averaging of Parameter in PaddlePaddle
-
-We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training.
-
-	**Advantages**:
-    - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op.
-    - Makes it easy for the users to customize and extend the framework.
-
-	**Disadvantages**:
-    - Implementation requires re-writing the averaging methodology in Python.  
-
-### Low-Level implementation
-
-In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input:
- the optimizer
- the window_size to keep the updates
-
-The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
-
-The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
-
-### Python API implementation for ParameterAverageOptimizer
-
-Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following:
- Any optimizer (RMSProp , AdaGrad etc.)
- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision.
-
-Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
-We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.cc)
-
-#### Creation of the ParameterAverageOptimizer operator
-There are two ways for creating the ParameterAverageOptimizer op:
-1. We create the op immediately while building the computation graph.
-2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
-
-The proposal is to add the op immediately while building the computation graph.
-
-#### High-level API
-
-In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
--- a/doc/fluid/design/concepts/README.md
+++ b/doc/fluid/design/concepts/README.md
-A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of new the parameter server using Go and C++.
-
-Here are some initial thoughts. Your comments are welcome!
-
-# Required CMake Function
-
-I think we need only the following few CMake functions to make a project description mean and clean:
-
-<table>
-<thead>
-<tr>
-<th>C++</th>
-<th>CUDA C++</th>
-<th>Go</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td>cc_library </td>
-<td>nv_library </td>
-<td>go_library </td>
-</tr>
-<tr>
-<td>cc_binary </td>
-<td>nv_binary </td>
-<td>go_binary </td>
-</tr>
-<tr>
-<td> cc_test </td>
-<td> nv_test </td>
-<td> go_test </td>
-</tr>
-</tbody>
-</table>
-
-
- The `_library` functions generate  .a files from source code.
- The `_binary` functions generate executable binary files.
- The `_test` functions generate executable unit test files. They work like `_binary` but links `-lgtest` and `-lgtest_main`.
-
-The difference between `nv_` functions and `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler.
-
-Both `nv_` and `cc_` functions enables C++11 (-std=c++11).
-
-Also,
-
- to describe external dependencies, we need `external_library`.
- to build shared libraries, we need `shared_library`.
-
-## An Example Project
-
-Suppose that we have aforementioned functions defined in our `/cmake` directory.  The following example `CMakeLists.txt` describes a project including the following source files:
-
- tensor.h
- tensor.cc
- tensor_test.cc
- ops.h
- ops.cu
- ops_test.cu
- api.go
- api_test.go
-
-Suppose that ops.cu depends on CUDNN.
-
-```cmake
-# cc_binary parses tensor.cc and figures out that target also depend
-# on tensor.h.
-cc_binary(tensor
-  SRCS
-  tensor.cc)
-
-# The dependency to target tensor implies that if any of
-# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built.
-cc_test(tensor_test
-  SRCS
-  tensor_test.cc
-  DEPS
-  tensor)
-
-# I don't have a clear idea what parameters external_library need to
-# have.  @gangliao as a CMake expert would have better ideas.
-external_library(cudnn
-  ....)
-
-# Suppose that ops.cu depends on external target CUDNN.  Also, ops.cu
-# include global functions that take Tensor as their parameters, so
-# ops depend on tensor.  This implies that if any of tensor.{h.cc},
-# ops.{h,cu} is changed, ops need to be re-built.
-nv_library(ops
-  SRCS
-  ops.cu
-  DEPS
-  tensor
-  cudnn)  # cudnn is defined later.
-
-nv_test(ops_test
-  SRCS
-  ops_test.cu
-  DEPS
-  ops)
-
-# Because api.go defines a GO wrapper to ops and tensor, it depends on
-# both.  This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
-# api.go is changed, api need to be re-built.
-go_library(api
-  SRCS
-  api.go
-  DEPS
-  tensor # Because ops depend on tensor, this line is optional.
-  ops)
-
-go_test(api_test
-  SRCS
-  api_test.go
-  DEPS
-  api)
-
-
-# This builds libapi.so.  shared_library might use CMake target
-# api_shared so to distinguish it from above target api.
-shared_library(api
-  DEPS
-  api)
-
-```
-
-## Implementation
-
-As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph.  It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
-
-## Using Package Manager For Go
-
-Building Go binaries and libraries need to satisfy their dependencies, generally
-we can do `go get ./...` to download and compile all external dependencies. The
-problems are:
-
-1. `go get` will always get the latest code from the default branch of the
-    remote repo, so changes of dependents might break the build. This is very
-    different with what we already have in `cmake/external` which download a
-    specific version or commit id of the dependency.
-1. Some locations can not access external dependencies through the internet, as mentioned
-   in https://github.com/PaddlePaddle/Paddle/issues/2605. Using package management
-   tools can package the dependencies as a "vendor" package, which can be mirrored
-   at many cloud file hosting, so users what to compile paddle by themselves can
-   download this "vendor" package from a mirror site.
-
-### Choose A Suitable Tool
-
-As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools)
-list dozens of Go package managers. We choose the tool using following principles:
-
- Most "active" projects with more stars, more pull requests or commits
- Widely used project
-
-After comparing all these projects, we shall choose between the most popular
-tools: Godep and Glide.
-
-Here's a brief comparison between Godep and Glide
-: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are
-also many complaints about using `Godep`. There's also a new "official" pakcage
-management tool has been started at: https://github.com/golang/dep to resolve
-such problems, but it's currently at Alpha stage. So the best choice now is
-glide obviously.
-
-### Manage Go Packages
-
- Dependencies: `go/glide.yaml` will store the dependencies and their versions which
-  is directly imported by paddle. `go/glide.lock` will store all dependencies recursively
-  with their commit id. Builds will "lock" to these packages if we don't `glide up`
-  them
- Vendor package: `go/vendor` directory will generated when running `cmake` command. `cmake`
-  will download the code corresponding to `go/glide.lock`. If we put a vendor folder
-  under `go/`, cmake will just check the commit id to the packages under the folder,
-  if commit id matches, there will be no download at all.
--- a/doc/fluid/design/concepts/block.md
+++ b/doc/fluid/design/concepts/block.md
--- a/doc/fluid/design/concepts/cpp_data_feeding.md
+++ b/doc/fluid/design/concepts/cpp_data_feeding.md
--- a/doc/fluid/design/concepts/executor.md
+++ b/doc/fluid/design/concepts/executor.md
-# Executor Design Doc
-
-## Motivation
-In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
-[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
-
-The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, executor explicitly executes the stored precompiled code.
-
-## Overview
-
-An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
-
-## Executor
-
-The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
-It is very similar to how a push stack frame works when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not however, have the stack frame pop process.
-
-### The interface
-```c++
-  Executor(places);
-```
-A executor does not own any computing resources, a user can only construct an executor using the specified places.
-
-### Running an Executor
-
-```
-  void Run(ProgramDesc, Scope, block_id, create_local_scope);
-```
-An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished.
--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
-# Design Doc: Functions, Operators, and Layers
-
-In a DL system, we can compose one or more fine grained operators into a coarse grained one.  For example, the FC layer can be composed of a multiplication operator and an add operator.
-
-Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers.  But we need a well-defined separation.
-
-In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions:
-
-```c++
-template <typename T> T add(T x, T y) { return x + y; }
-template <typename T> T mul(T x, T y) { return x * y; }
-```
-
-Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name.  A C macro can do this. For example, the following macro invocation
-
-```c++
-#define MAKE_FUNCTION_OPERATOR(mul);
-```
-
-generates
-
-```c++
-template <typename T> class mulOp : public OperatorBase {...};
-REGISTER_OP(mulOp<float32>, "mul");
-```
-
-so that in Python we can create operator mul by:
-
-```python
-X1 = Var()
-X2 = Var()
-Y = Var()
-paddle.cpp.create_operator("mul", input=[X1, X2], output=Y)
-```
-
-Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`:
-
-```c++
-template <typename T>
-class FCOp : public OperatorBase {
- public:
-  void Run(...) {
-    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b"));
-  }
-};
-REGISTER_OP(FCOp, "fc");
-```
-
-We need to support such composition in Python as well.  To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`.  This higher level operator API should be compatible with the layer API.
-
-Let's explain using an example.  Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`:
-
-```python
-def operator.mul(X1, X2):
-    O = Var()
-    paddle.cpp.create_operator("mul", input={X1, Y1}, output=O)
-    return O
-
-def operator.add(X1, X2):
-    O = Var()
-    paddle.cpp.create_operator("add", input={X1, X2}, output=O)
-    return O
-```
-
-Above code snippets are automatically generated.  Given them, users can define
-
-```python
-def layer.fc(X):
-    W = Var()
-    b = Var()
-    return operator.add(operator.mul(X, W), b)
-```
-
-If we don't have `operator.mul` and `operator.add`, the definiton of `layer.fc` would be complicated:
-
-```python
-def layer.fc(X):
-    W = Var()
-    b = Var()
-    O1 = Var()
-    paddle.cpp.create_operator("mul", input=[X, W], output=O1)
-    O2 = Var()
-    paddle.cpp.create_operator("add", input=[O1, b], output=O2)
-    return O2
-```
-
-We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`.  So we have the following concepts in above illustrative example:
-
-<table>
-<thead>
-<tr>
-<th>C++ functions/functors</th>
-<th>mul</th>
-<th>add</th>
-<th></th>
-<th></th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td>C++ operator class </td>
-<td>mulOp</td>
-<td>addOp </td>
-<td>FCOp </td>
-<td></td>
-</tr>
-<tr>
-<td>Python binding  </td>
-<td>operator.mul</td>
-<td> operator.add </td>
-<td>operator.fc </td>
-<td></td>
-</tr>
-<tr>
-<td>Python function   </td>
-<td></td>
-<td></td>
-<td> </td>
-<td>layer.fc</td>
-</tr>
-</tbody>
-</table>
-
-
-This is how we differentiate layer and operators in PaddlePaddle:
-
- those defined in C++ and have a lightweighted Python wrapper in module `operators` are operators; whereas
- those who don't have C++ implementations but a Python implementation that compose C++ operators are known as layers.
--- a/doc/fluid/design/concepts/images/multiple_reader.png
+++ b/doc/fluid/design/concepts/images/multiple_reader.png
--- a/doc/fluid/design/concepts/images/parallel_executor_overview.dot
+++ b/doc/fluid/design/concepts/images/parallel_executor_overview.dot
--- a/doc/fluid/design/concepts/images/parallel_executor_overview.png
+++ b/doc/fluid/design/concepts/images/parallel_executor_overview.png
--- a/doc/fluid/design/concepts/images/readers.png
+++ b/doc/fluid/design/concepts/images/readers.png
--- a/doc/fluid/design/concepts/index_cn.rst
+++ b/doc/fluid/design/concepts/index_cn.rst
-核心概念
-------------
-
-.. toctree::
-  :maxdepth: 1
-
-  README.md
-  cpp_data_feeding.md
-  functions_operators_layers.md
-  program.md
-  variable.md
-  var_desc.md
-  tensor.md
-  tensor_array.md
-  lod_tensor.md
-  block.md
-  scope.md
-  executor.md
-  parallel_executor.md
--- a/doc/fluid/design/concepts/index_en.rst
+++ b/doc/fluid/design/concepts/index_en.rst
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ b/doc/fluid/design/concepts/lod_tensor.md
--- a/doc/fluid/design/concepts/parallel_executor.md
+++ b/doc/fluid/design/concepts/parallel_executor.md
--- a/doc/fluid/design/concepts/program.md
+++ b/doc/fluid/design/concepts/program.md
--- a/doc/fluid/design/concepts/python_data_feeding.md
+++ b/doc/fluid/design/concepts/python_data_feeding.md
--- a/doc/fluid/design/concepts/scope.md
+++ b/doc/fluid/design/concepts/scope.md
--- a/doc/fluid/design/concepts/tensor.md
+++ b/doc/fluid/design/concepts/tensor.md
--- a/doc/fluid/design/concepts/tensor_array.md
+++ b/doc/fluid/design/concepts/tensor_array.md
--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
--- a/doc/fluid/design/concepts/variable.md
+++ b/doc/fluid/design/concepts/variable.md
--- a/doc/fluid/design/concurrent/channel.md
+++ b/doc/fluid/design/concurrent/channel.md
--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
--- a/doc/fluid/design/concurrent/csp.md
+++ b/doc/fluid/design/concurrent/csp.md
--- a/doc/fluid/design/concurrent/go_op.md
+++ b/doc/fluid/design/concurrent/go_op.md
--- a/doc/fluid/design/concurrent/images/channel_recv.png
+++ b/doc/fluid/design/concurrent/images/channel_recv.png
--- a/doc/fluid/design/concurrent/images/channel_send.png
+++ b/doc/fluid/design/concurrent/images/channel_send.png
--- a/doc/fluid/design/concurrent/images/select_op_workflow.png
+++ b/doc/fluid/design/concurrent/images/select_op_workflow.png
--- a/doc/fluid/design/concurrent/index_cn.rst
+++ b/doc/fluid/design/concurrent/index_cn.rst
-并发编程
------------
-
-.. toctree::
-  :maxdepth: 1
-
-  concurrent_programming.md
-  parallel_do.md
--- a/doc/fluid/design/concurrent/index_en.rst
+++ b/doc/fluid/design/concurrent/index_en.rst
--- a/doc/fluid/design/concurrent/parallel_do.md
+++ b/doc/fluid/design/concurrent/parallel_do.md
--- a/doc/fluid/design/concurrent/select_op.md
+++ b/doc/fluid/design/concurrent/select_op.md
--- a/doc/fluid/design/data_type/float16.md
+++ b/doc/fluid/design/data_type/float16.md
--- a/doc/fluid/design/data_type/index_cn.rst
+++ b/doc/fluid/design/data_type/index_cn.rst
--- a/doc/fluid/design/data_type/index_en.rst
+++ b/doc/fluid/design/data_type/index_en.rst
--- a/doc/fluid/design/dist_train/README.md
+++ b/doc/fluid/design/dist_train/README.md
--- a/doc/fluid/design/dist_train/async_update.md
+++ b/doc/fluid/design/dist_train/async_update.md
--- a/doc/fluid/design/dist_train/dist_train_nccl2.md
+++ b/doc/fluid/design/dist_train/dist_train_nccl2.md
--- a/doc/fluid/design/dist_train/distributed_architecture.md
+++ b/doc/fluid/design/dist_train/distributed_architecture.md
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
--- a/doc/fluid/design/dist_train/distributed_traing_review.md
+++ b/doc/fluid/design/dist_train/distributed_traing_review.md
--- a/doc/fluid/design/dist_train/index_cn.rst
+++ b/doc/fluid/design/dist_train/index_cn.rst
--- a/doc/fluid/design/dist_train/index_en.rst
+++ b/doc/fluid/design/dist_train/index_en.rst
--- a/doc/fluid/design/dist_train/mpi_enabled_design.md
+++ b/doc/fluid/design/dist_train/mpi_enabled_design.md
--- a/doc/fluid/design/dist_train/multi_cpu.md
+++ b/doc/fluid/design/dist_train/multi_cpu.md
--- a/doc/fluid/design/dist_train/parameter_server.md
+++ b/doc/fluid/design/dist_train/parameter_server.md
--- a/doc/fluid/design/dist_train/src/async_distributed_training.png
+++ b/doc/fluid/design/dist_train/src/async_distributed_training.png
--- a/doc/fluid/design/dist_train/src/async_pserver.graffle
+++ b/doc/fluid/design/dist_train/src/async_pserver.graffle
--- a/doc/fluid/design/dist_train/src/async_pserver.png
+++ b/doc/fluid/design/dist_train/src/async_pserver.png
--- a/doc/fluid/design/dist_train/src/async_update.graffle
+++ b/doc/fluid/design/dist_train/src/async_update.graffle
--- a/doc/fluid/design/dist_train/src/async_update.png
+++ b/doc/fluid/design/dist_train/src/async_update.png
--- a/doc/fluid/design/dist_train/src/compiler.graffle
+++ b/doc/fluid/design/dist_train/src/compiler.graffle
--- a/doc/fluid/design/dist_train/src/compiler.png
+++ b/doc/fluid/design/dist_train/src/compiler.png
--- a/doc/fluid/design/dist_train/src/dist-graph.graffle
+++ b/doc/fluid/design/dist_train/src/dist-graph.graffle
--- a/doc/fluid/design/dist_train/src/dist-graph.png
+++ b/doc/fluid/design/dist_train/src/dist-graph.png
--- a/doc/fluid/design/dist_train/src/distributed_architecture.graffle
+++ b/doc/fluid/design/dist_train/src/distributed_architecture.graffle
--- a/doc/fluid/design/dist_train/src/distributed_architecture.png
+++ b/doc/fluid/design/dist_train/src/distributed_architecture.png
--- a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle
+++ b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle
--- a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg
+++ b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg
--- a/doc/fluid/design/dist_train/src/distributed_training.graffle
+++ b/doc/fluid/design/dist_train/src/distributed_training.graffle
--- a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle
+++ b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle
--- a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png
+++ b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png
--- a/doc/fluid/design/dist_train/src/local-graph.graffle
+++ b/doc/fluid/design/dist_train/src/local-graph.graffle
--- a/doc/fluid/design/dist_train/src/local-graph.png
+++ b/doc/fluid/design/dist_train/src/local-graph.png
--- a/doc/fluid/design/dist_train/src/local_architecture.graffle
+++ b/doc/fluid/design/dist_train/src/local_architecture.graffle
--- a/doc/fluid/design/dist_train/src/local_architecture.png
+++ b/doc/fluid/design/dist_train/src/local_architecture.png
--- a/doc/fluid/design/dist_train/src/lookup_table.png
+++ b/doc/fluid/design/dist_train/src/lookup_table.png
--- a/doc/fluid/design/dist_train/src/lookup_table_training.png
+++ b/doc/fluid/design/dist_train/src/lookup_table_training.png
--- a/doc/fluid/design/dist_train/src/mpi_module.png
+++ b/doc/fluid/design/dist_train/src/mpi_module.png
--- a/doc/fluid/design/dist_train/src/multi-threads.graffle
+++ b/doc/fluid/design/dist_train/src/multi-threads.graffle
--- a/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png
+++ b/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png
--- a/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png
+++ b/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png
--- a/doc/fluid/design/dist_train/src/ncc2_design.graffle
+++ b/doc/fluid/design/dist_train/src/ncc2_design.graffle
--- a/doc/fluid/design/dist_train/src/ncc2_design.png
+++ b/doc/fluid/design/dist_train/src/ncc2_design.png
--- a/doc/fluid/design/dist_train/src/paddle-compile.graffle
+++ b/doc/fluid/design/dist_train/src/paddle-compile.graffle
--- a/doc/fluid/design/dist_train/src/paddle-compile.png
+++ b/doc/fluid/design/dist_train/src/paddle-compile.png
--- a/doc/fluid/design/dist_train/src/remote_executor.graffle
+++ b/doc/fluid/design/dist_train/src/remote_executor.graffle
--- a/doc/fluid/design/dist_train/src/remote_executor.png
+++ b/doc/fluid/design/dist_train/src/remote_executor.png
--- a/doc/fluid/design/dist_train/src/sparse_update.graffle
+++ b/doc/fluid/design/dist_train/src/sparse_update.graffle
--- a/doc/fluid/design/dist_train/src/sparse_update.png
+++ b/doc/fluid/design/dist_train/src/sparse_update.png
--- a/doc/fluid/design/dist_train/src/sync_distributed_training.png
+++ b/doc/fluid/design/dist_train/src/sync_distributed_training.png
--- a/doc/fluid/design/dynamic_rnn/2_level_rnn.dot
+++ b/doc/fluid/design/dynamic_rnn/2_level_rnn.dot
--- a/doc/fluid/design/dynamic_rnn/2_level_rnn.png
+++ b/doc/fluid/design/dynamic_rnn/2_level_rnn.png
--- a/doc/fluid/design/dynamic_rnn/index_cn.rst
+++ b/doc/fluid/design/dynamic_rnn/index_cn.rst
--- a/doc/fluid/design/dynamic_rnn/index_en.rst
+++ b/doc/fluid/design/dynamic_rnn/index_en.rst
--- a/doc/fluid/design/dynamic_rnn/rnn.dot
+++ b/doc/fluid/design/dynamic_rnn/rnn.dot
--- a/doc/fluid/design/dynamic_rnn/rnn.jpg
+++ b/doc/fluid/design/dynamic_rnn/rnn.jpg
--- a/doc/fluid/design/dynamic_rnn/rnn.md
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
--- a/doc/fluid/design/dynamic_rnn/rnn.png
+++ b/doc/fluid/design/dynamic_rnn/rnn.png
--- a/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot
+++ b/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot
--- a/doc/fluid/design/dynamic_rnn/rnn_2level_data.png
+++ b/doc/fluid/design/dynamic_rnn/rnn_2level_data.png
--- a/doc/fluid/design/dynamic_rnn/rnn_design.md
+++ b/doc/fluid/design/dynamic_rnn/rnn_design.md
--- a/doc/fluid/design/dynamic_rnn/rnn_design_en.md
+++ b/doc/fluid/design/dynamic_rnn/rnn_design_en.md
--- a/doc/fluid/design/execution/if_else_op.md
+++ b/doc/fluid/design/execution/if_else_op.md
--- a/doc/fluid/design/execution/index_cn.rst
+++ b/doc/fluid/design/execution/index_cn.rst
--- a/doc/fluid/design/execution/index_en.rst
+++ b/doc/fluid/design/execution/index_en.rst
--- a/doc/fluid/design/execution/switch.md
+++ b/doc/fluid/design/execution/switch.md
--- a/doc/fluid/design/index_cn.rst
+++ b/doc/fluid/design/index_cn.rst
--- a/doc/fluid/design/index_en.rst
+++ b/doc/fluid/design/index_en.rst
--- a/doc/fluid/design/interface/index_cn.rst
+++ b/doc/fluid/design/interface/index_cn.rst
--- a/doc/fluid/design/interface/index_en.rst
+++ b/doc/fluid/design/interface/index_en.rst
--- a/doc/fluid/design/ir/overview.md
+++ b/doc/fluid/design/ir/overview.md
--- a/doc/fluid/design/memory/README.md
+++ b/doc/fluid/design/memory/README.md
--- a/doc/fluid/design/memory/images/control_flow_graph.png
+++ b/doc/fluid/design/memory/images/control_flow_graph.png
--- a/doc/fluid/design/memory/images/dataflow_equations.png
+++ b/doc/fluid/design/memory/images/dataflow_equations.png
--- a/doc/fluid/design/memory/images/deep_learning.png
+++ b/doc/fluid/design/memory/images/deep_learning.png
--- a/doc/fluid/design/memory/index_cn.rst
+++ b/doc/fluid/design/memory/index_cn.rst
--- a/doc/fluid/design/memory/index_en.rst
+++ b/doc/fluid/design/memory/index_en.rst
--- a/doc/fluid/design/memory/memory_optimization.md
+++ b/doc/fluid/design/memory/memory_optimization.md
--- a/doc/fluid/design/modules/backward.md
+++ b/doc/fluid/design/modules/backward.md
--- a/doc/fluid/design/modules/batch_norm_op.md
+++ b/doc/fluid/design/modules/batch_norm_op.md
--- a/doc/fluid/design/modules/evaluator.md
+++ b/doc/fluid/design/modules/evaluator.md
--- a/doc/fluid/design/modules/images/batch_norm_fork.dot
+++ b/doc/fluid/design/modules/images/batch_norm_fork.dot
--- a/doc/fluid/design/modules/images/batch_norm_fork.png
+++ b/doc/fluid/design/modules/images/batch_norm_fork.png
--- a/doc/fluid/design/modules/images/batch_norm_op_kernel.png
+++ b/doc/fluid/design/modules/images/batch_norm_op_kernel.png
--- a/doc/fluid/design/modules/images/feed_forward.png
+++ b/doc/fluid/design/modules/images/feed_forward.png
--- a/doc/fluid/design/modules/images/feed_forward_regularized.png
+++ b/doc/fluid/design/modules/images/feed_forward_regularized.png
--- a/doc/fluid/design/modules/images/l1_regularization.png
+++ b/doc/fluid/design/modules/images/l1_regularization.png
--- a/doc/fluid/design/modules/images/l2_regularization.png
+++ b/doc/fluid/design/modules/images/l2_regularization.png
--- a/doc/fluid/design/modules/images/loss_equation.png
+++ b/doc/fluid/design/modules/images/loss_equation.png
--- a/doc/fluid/design/modules/index_cn.rst
+++ b/doc/fluid/design/modules/index_cn.rst
--- a/doc/fluid/design/modules/index_en.rst
+++ b/doc/fluid/design/modules/index_en.rst
--- a/doc/fluid/design/modules/infer_var_type.md
+++ b/doc/fluid/design/modules/infer_var_type.md
--- a/doc/fluid/design/modules/net_op_design.md
+++ b/doc/fluid/design/modules/net_op_design.md
--- a/doc/fluid/design/modules/optimizer.md
+++ b/doc/fluid/design/modules/optimizer.md
--- a/doc/fluid/design/modules/prune.md
+++ b/doc/fluid/design/modules/prune.md
--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
--- a/doc/fluid/design/modules/register_grad_op.md
+++ b/doc/fluid/design/modules/register_grad_op.md
--- a/doc/fluid/design/modules/regularization.md
+++ b/doc/fluid/design/modules/regularization.md
--- a/doc/fluid/design/modules/selected_rows.md
+++ b/doc/fluid/design/modules/selected_rows.md
--- a/doc/fluid/design/motivation/api.md
+++ b/doc/fluid/design/motivation/api.md
--- a/doc/fluid/design/motivation/fluid-compiler.graffle
+++ b/doc/fluid/design/motivation/fluid-compiler.graffle
--- a/doc/fluid/design/motivation/fluid-compiler.png
+++ b/doc/fluid/design/motivation/fluid-compiler.png
--- a/doc/fluid/design/motivation/fluid.md
+++ b/doc/fluid/design/motivation/fluid.md
--- a/doc/fluid/design/motivation/fluid_compiler.md
+++ b/doc/fluid/design/motivation/fluid_compiler.md
--- a/doc/fluid/design/motivation/index_cn.rst
+++ b/doc/fluid/design/motivation/index_cn.rst
--- a/doc/fluid/design/motivation/index_en.rst
+++ b/doc/fluid/design/motivation/index_en.rst
--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
--- a/doc/fluid/design/multi_devices/index_cn.rst
+++ b/doc/fluid/design/multi_devices/index_cn.rst
--- a/doc/fluid/design/multi_devices/index_en.rst
+++ b/doc/fluid/design/multi_devices/index_en.rst
--- a/doc/fluid/design/multi_devices/kernel_hint_design.md
+++ b/doc/fluid/design/multi_devices/kernel_hint_design.md
--- a/doc/fluid/design/multi_devices/kernel_selection.md
+++ b/doc/fluid/design/multi_devices/kernel_selection.md
--- a/doc/fluid/design/multi_devices/operator_kernel_type.md
+++ b/doc/fluid/design/multi_devices/operator_kernel_type.md
--- a/doc/fluid/design/network/deep_speech_2.md
+++ b/doc/fluid/design/network/deep_speech_2.md
--- a/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg
+++ b/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg
--- a/doc/fluid/design/network/images/beam_search.png
+++ b/doc/fluid/design/network/images/beam_search.png
--- a/doc/fluid/design/network/images/ds2_network.png
+++ b/doc/fluid/design/network/images/ds2_network.png
--- a/doc/fluid/design/network/index_cn.rst
+++ b/doc/fluid/design/network/index_cn.rst
--- a/doc/fluid/design/network/index_en.rst
+++ b/doc/fluid/design/network/index_en.rst
--- a/doc/fluid/design/network/sequence_decoder.md
+++ b/doc/fluid/design/network/sequence_decoder.md
--- a/doc/fluid/design/onnx/images/project_structure.png
+++ b/doc/fluid/design/onnx/images/project_structure.png
--- a/doc/fluid/design/onnx/onnx_convertor.md
+++ b/doc/fluid/design/onnx/onnx_convertor.md
--- a/doc/fluid/design/others/auto_gradient_check.md
+++ b/doc/fluid/design/others/auto_gradient_check.md
--- a/doc/fluid/design/others/dcgan.png
+++ b/doc/fluid/design/others/dcgan.png
--- a/doc/fluid/design/others/gan_api.md
+++ b/doc/fluid/design/others/gan_api.md
--- a/doc/fluid/design/others/graph.md
+++ b/doc/fluid/design/others/graph.md
--- a/doc/fluid/design/others/graph_survey.md
+++ b/doc/fluid/design/others/graph_survey.md
--- a/doc/fluid/design/others/images/graph_construction_example.bash
+++ b/doc/fluid/design/others/images/graph_construction_example.bash
--- a/doc/fluid/design/others/images/graph_construction_example.dot
+++ b/doc/fluid/design/others/images/graph_construction_example.dot
--- a/doc/fluid/design/others/images/graph_construction_example_all.png
+++ b/doc/fluid/design/others/images/graph_construction_example_all.png
--- a/doc/fluid/design/others/images/graph_construction_example_forward_backward.png
+++ b/doc/fluid/design/others/images/graph_construction_example_forward_backward.png
--- a/doc/fluid/design/others/images/graph_construction_example_forward_only.png
+++ b/doc/fluid/design/others/images/graph_construction_example_forward_only.png
--- a/doc/fluid/design/others/parameters_in_cpp.md
+++ b/doc/fluid/design/others/parameters_in_cpp.md
--- a/doc/fluid/design/others/simple_op_design.md
+++ b/doc/fluid/design/others/simple_op_design.md
--- a/doc/fluid/design/others/test.dot
+++ b/doc/fluid/design/others/test.dot
--- a/doc/fluid/design/others/test.dot.png
+++ b/doc/fluid/design/others/test.dot.png
--- a/doc/fluid/design/quantization/fixed_point_quantization.md
+++ b/doc/fluid/design/quantization/fixed_point_quantization.md
--- a/doc/fluid/design/quantization/quantization_backward_and_optimization.png
+++ b/doc/fluid/design/quantization/quantization_backward_and_optimization.png
--- a/doc/fluid/design/quantization/quantization_equivalent_forward.png
+++ b/doc/fluid/design/quantization/quantization_equivalent_forward.png
--- a/doc/fluid/design/quantization/quantization_forward.png
+++ b/doc/fluid/design/quantization/quantization_forward.png
--- a/doc/fluid/dev/api_doc_std_cn.md
+++ b/doc/fluid/dev/api_doc_std_cn.md
--- a/doc/fluid/dev/api_doc_std_en.md
+++ b/doc/fluid/dev/api_doc_std_en.md
--- a/doc/fluid/dev/ci_build_whl.png
+++ b/doc/fluid/dev/ci_build_whl.png
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
--- a/doc/fluid/dev/name_convention.md
+++ b/doc/fluid/dev/name_convention.md
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
--- a/doc/fluid/dev/new_op_en.md
+++ b/doc/fluid/dev/new_op_en.md
--- a/doc/fluid/dev/new_op_kernel.md
+++ b/doc/fluid/dev/new_op_kernel.md
--- a/doc/fluid/dev/op_markdown_format.md
+++ b/doc/fluid/dev/op_markdown_format.md
--- a/doc/fluid/dev/releasing_process_cn.md
+++ b/doc/fluid/dev/releasing_process_cn.md
--- a/doc/fluid/dev/releasing_process_en.md
+++ b/doc/fluid/dev/releasing_process_en.md
--- a/doc/fluid/dev/src/fc.py
+++ b/doc/fluid/dev/src/fc.py
--- a/doc/fluid/dev/support_new_device.md
+++ b/doc/fluid/dev/support_new_device.md
--- a/doc/fluid/dev/use_eigen_cn.md
+++ b/doc/fluid/dev/use_eigen_cn.md
--- a/doc/fluid/dev/use_eigen_en.md
+++ b/doc/fluid/dev/use_eigen_en.md
--- a/doc/fluid/dev/write_docs_cn.rst
+++ b/doc/fluid/dev/write_docs_cn.rst
--- a/doc/fluid/dev/write_docs_en.rst
+++ b/doc/fluid/dev/write_docs_en.rst
--- a/doc/fluid/faq/index_cn.rst
+++ b/doc/fluid/faq/index_cn.rst
--- a/doc/fluid/faq/index_en.rst
+++ b/doc/fluid/faq/index_en.rst
--- a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
+++ b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
--- a/doc/fluid/getstarted/concepts/index_cn.rst
+++ b/doc/fluid/getstarted/concepts/index_cn.rst
--- a/doc/fluid/getstarted/concepts/index_en.rst
+++ b/doc/fluid/getstarted/concepts/index_en.rst
--- a/doc/fluid/getstarted/concepts/reader/README.md
+++ b/doc/fluid/getstarted/concepts/reader/README.md
--- a/doc/fluid/getstarted/concepts/save_model/model_format.md
+++ b/doc/fluid/getstarted/concepts/save_model/model_format.md
--- a/doc/fluid/getstarted/index_cn.rst
+++ b/doc/fluid/getstarted/index_cn.rst
--- a/doc/fluid/getstarted/index_en.rst
+++ b/doc/fluid/getstarted/index_en.rst
--- a/doc/fluid/getstarted/quickstart_cn.rst
+++ b/doc/fluid/getstarted/quickstart_cn.rst
--- a/doc/fluid/getstarted/quickstart_en.rst
+++ b/doc/fluid/getstarted/quickstart_en.rst
--- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
--- a/doc/fluid/howto/cluster/fluid_cluster_train_en.md
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_en.md
--- a/doc/fluid/howto/cluster/fluid_recordio.md
+++ b/doc/fluid/howto/cluster/fluid_recordio.md
--- a/doc/fluid/howto/cluster/nccl2_rdma_training.md
+++ b/doc/fluid/howto/cluster/nccl2_rdma_training.md
--- a/doc/fluid/howto/index_cn.rst
+++ b/doc/fluid/howto/index_cn.rst
--- a/doc/fluid/howto/index_en.rst
+++ b/doc/fluid/howto/index_en.rst
--- a/doc/fluid/howto/inference/build_and_install_lib_cn.rst
+++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
--- a/doc/fluid/howto/inference/index_cn.rst
+++ b/doc/fluid/howto/inference/index_cn.rst
--- a/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
+++ b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
--- a/doc/fluid/howto/optimization/benchmark/index_cn.rst
+++ b/doc/fluid/howto/optimization/benchmark/index_cn.rst
--- a/doc/fluid/howto/optimization/benchmark/index_en.rst
+++ b/doc/fluid/howto/optimization/benchmark/index_en.rst
--- a/doc/fluid/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
--- a/doc/fluid/howto/optimization/host_memory_profiling_cn.md
+++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
--- a/doc/fluid/howto/optimization/index_cn.rst
+++ b/doc/fluid/howto/optimization/index_cn.rst
--- a/doc/fluid/howto/optimization/index_en.rst
+++ b/doc/fluid/howto/optimization/index_en.rst
--- a/doc/fluid/howto/optimization/pprof_1.png
+++ b/doc/fluid/howto/optimization/pprof_1.png
--- a/doc/fluid/howto/optimization/pprof_2.png
+++ b/doc/fluid/howto/optimization/pprof_2.png
--- a/doc/fluid/howto/optimization/timeline.jpeg
+++ b/doc/fluid/howto/optimization/timeline.jpeg
--- a/doc/fluid/howto/optimization/timeline_cn.md
+++ b/doc/fluid/howto/optimization/timeline_cn.md
--- a/doc/fluid/howto/optimization/timeline_en.md
+++ b/doc/fluid/howto/optimization/timeline_en.md
--- a/doc/fluid/howto/optimization/tracing.jpeg
+++ b/doc/fluid/howto/optimization/tracing.jpeg
--- a/doc/fluid/howto/performance/error_clip.md
+++ b/doc/fluid/howto/performance/error_clip.md
--- a/doc/fluid/howto/performance/images/profiler.png
+++ b/doc/fluid/howto/performance/images/profiler.png
--- a/doc/fluid/howto/performance/profiler.md
+++ b/doc/fluid/howto/performance/profiler.md
--- a/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle
+++ b/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle
--- a/doc/fluid/howto/third_party/images/multigpu_allreduce.png
+++ b/doc/fluid/howto/third_party/images/multigpu_allreduce.png
--- a/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle
+++ b/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle
--- a/doc/fluid/howto/third_party/images/multigpu_before_convert.png
+++ b/doc/fluid/howto/third_party/images/multigpu_before_convert.png
--- a/doc/fluid/howto/third_party/mkldnn_fluid.md
+++ b/doc/fluid/howto/third_party/mkldnn_fluid.md
--- a/doc/fluid/howto/third_party/paddle_nccl.md
+++ b/doc/fluid/howto/third_party/paddle_nccl.md
--- a/doc/fluid/images/1.png
+++ b/doc/fluid/images/1.png
--- a/doc/fluid/images/2.png
+++ b/doc/fluid/images/2.png
--- a/doc/fluid/images/2_level_rnn.dot
+++ b/doc/fluid/images/2_level_rnn.dot
--- a/doc/fluid/images/2_level_rnn.png
+++ b/doc/fluid/images/2_level_rnn.png
--- a/doc/fluid/images/3.png
+++ b/doc/fluid/images/3.png
--- a/doc/fluid/images/4.png
+++ b/doc/fluid/images/4.png
--- a/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg
+++ b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg
--- a/doc/fluid/images/LoDTensor.png
+++ b/doc/fluid/images/LoDTensor.png
--- a/doc/fluid/images/asgd.gif
+++ b/doc/fluid/images/asgd.gif
--- a/doc/fluid/images/batch_norm_fork.dot
+++ b/doc/fluid/images/batch_norm_fork.dot
--- a/doc/fluid/images/batch_norm_fork.png
+++ b/doc/fluid/images/batch_norm_fork.png
--- a/doc/fluid/images/batch_norm_op_kernel.png
+++ b/doc/fluid/images/batch_norm_op_kernel.png
--- a/doc/fluid/images/beam_search.png
+++ b/doc/fluid/images/beam_search.png
--- a/doc/fluid/images/ci_build_whl.png
+++ b/doc/fluid/images/ci_build_whl.png
--- a/doc/fluid/images/compile_run_time.png
+++ b/doc/fluid/images/compile_run_time.png
--- a/doc/fluid/images/compiler.graffle
+++ b/doc/fluid/images/compiler.graffle
--- a/doc/fluid/images/compiler.png
+++ b/doc/fluid/images/compiler.png
--- a/doc/fluid/images/control_flow_graph.png
+++ b/doc/fluid/images/control_flow_graph.png
--- a/doc/fluid/images/dataflow_equations.png
+++ b/doc/fluid/images/dataflow_equations.png
--- a/doc/fluid/images/dcgan.png
+++ b/doc/fluid/images/dcgan.png
--- a/doc/fluid/images/deep_learning.png
+++ b/doc/fluid/images/deep_learning.png
--- a/doc/fluid/images/dist-graph.graffle
+++ b/doc/fluid/images/dist-graph.graffle
--- a/doc/fluid/images/dist-graph.png
+++ b/doc/fluid/images/dist-graph.png
--- a/doc/fluid/images/distributed_architecture.graffle
+++ b/doc/fluid/images/distributed_architecture.graffle
--- a/doc/fluid/images/distributed_architecture.png
+++ b/doc/fluid/images/distributed_architecture.png
--- a/doc/fluid/images/ds2_network.png
+++ b/doc/fluid/images/ds2_network.png
--- a/doc/fluid/images/executor.png
+++ b/doc/fluid/images/executor.png
--- a/doc/fluid/images/feed_forward.png
+++ b/doc/fluid/images/feed_forward.png
--- a/doc/fluid/images/feed_forward_regularized.png
+++ b/doc/fluid/images/feed_forward_regularized.png
--- a/doc/fluid/images/fluid-compiler.graffle
+++ b/doc/fluid/images/fluid-compiler.graffle
--- a/doc/fluid/images/fluid-compiler.png
+++ b/doc/fluid/images/fluid-compiler.png
--- a/doc/fluid/images/fluid_examples.png
+++ b/doc/fluid/images/fluid_examples.png
--- a/doc/fluid/images/fluid_module_1.png
+++ b/doc/fluid/images/fluid_module_1.png
--- a/doc/fluid/images/fluid_module_2.png
+++ b/doc/fluid/images/fluid_module_2.png
--- a/doc/fluid/images/graph_construction_example.bash
+++ b/doc/fluid/images/graph_construction_example.bash
--- a/doc/fluid/images/graph_construction_example.dot
+++ b/doc/fluid/images/graph_construction_example.dot
--- a/doc/fluid/images/graph_construction_example_all.png
+++ b/doc/fluid/images/graph_construction_example_all.png
--- a/doc/fluid/images/graph_construction_example_forward_backward.png
+++ b/doc/fluid/images/graph_construction_example_forward_backward.png
--- a/doc/fluid/images/graph_construction_example_forward_only.png
+++ b/doc/fluid/images/graph_construction_example_forward_only.png
--- a/doc/fluid/images/l1_regularization.png
+++ b/doc/fluid/images/l1_regularization.png
--- a/doc/fluid/images/l2_regularization.png
+++ b/doc/fluid/images/l2_regularization.png
--- a/doc/fluid/images/layer.png
+++ b/doc/fluid/images/layer.png
--- a/doc/fluid/images/local-graph.graffle
+++ b/doc/fluid/images/local-graph.graffle
--- a/doc/fluid/images/local-graph.png
+++ b/doc/fluid/images/local-graph.png
--- a/doc/fluid/images/local_architecture.graffle
+++ b/doc/fluid/images/local_architecture.graffle
--- a/doc/fluid/images/local_architecture.png
+++ b/doc/fluid/images/local_architecture.png
--- a/doc/fluid/images/lookup_table.png
+++ b/doc/fluid/images/lookup_table.png
--- a/doc/fluid/images/lookup_table_training.png
+++ b/doc/fluid/images/lookup_table_training.png
--- a/doc/fluid/images/loss_equation.png
+++ b/doc/fluid/images/loss_equation.png
--- a/doc/fluid/images/multi-threads.graffle
+++ b/doc/fluid/images/multi-threads.graffle
--- a/doc/fluid/images/multi-threads@3x.png
+++ b/doc/fluid/images/multi-threads@3x.png
--- a/doc/fluid/images/multigpu_allreduce.graffle
+++ b/doc/fluid/images/multigpu_allreduce.graffle
--- a/doc/fluid/images/multigpu_allreduce.png
+++ b/doc/fluid/images/multigpu_allreduce.png
--- a/doc/fluid/images/multigpu_before_convert.graffle
+++ b/doc/fluid/images/multigpu_before_convert.graffle
--- a/doc/fluid/images/multigpu_before_convert.png
+++ b/doc/fluid/images/multigpu_before_convert.png
--- a/doc/fluid/images/multiple_reader.png
+++ b/doc/fluid/images/multiple_reader.png
--- a/doc/fluid/images/op.dot
+++ b/doc/fluid/images/op.dot
--- a/doc/fluid/images/op_op_with_kern_class_diagram.dot
+++ b/doc/fluid/images/op_op_with_kern_class_diagram.dot
--- a/doc/fluid/images/op_with_kernel.dot
+++ b/doc/fluid/images/op_with_kernel.dot
--- a/doc/fluid/images/operator1.png
+++ b/doc/fluid/images/operator1.png
--- a/doc/fluid/images/operator2.png
+++ b/doc/fluid/images/operator2.png
--- a/doc/fluid/images/paddle-compile.graffle
+++ b/doc/fluid/images/paddle-compile.graffle
--- a/doc/fluid/images/paddle-compile.png
+++ b/doc/fluid/images/paddle-compile.png
--- a/doc/fluid/images/place.png
+++ b/doc/fluid/images/place.png
--- a/doc/fluid/images/pprof_1.png
+++ b/doc/fluid/images/pprof_1.png
--- a/doc/fluid/images/pprof_2.png
+++ b/doc/fluid/images/pprof_2.png
--- a/doc/fluid/images/print_fluid_program.png
+++ b/doc/fluid/images/print_fluid_program.png
--- a/doc/fluid/images/profiler.png
+++ b/doc/fluid/images/profiler.png
--- a/doc/fluid/images/program_desc1.png
+++ b/doc/fluid/images/program_desc1.png
--- a/doc/fluid/images/program_desc2.png
+++ b/doc/fluid/images/program_desc2.png
--- a/doc/fluid/images/raw_input.png
+++ b/doc/fluid/images/raw_input.png
--- a/doc/fluid/images/readers.png
+++ b/doc/fluid/images/readers.png
--- a/doc/fluid/images/remote_executor.graffle
+++ b/doc/fluid/images/remote_executor.graffle
--- a/doc/fluid/images/remote_executor.png
+++ b/doc/fluid/images/remote_executor.png
--- a/doc/fluid/images/rnn.dot
+++ b/doc/fluid/images/rnn.dot
--- a/doc/fluid/images/rnn.jpg
+++ b/doc/fluid/images/rnn.jpg
--- a/doc/fluid/images/rnn.png
+++ b/doc/fluid/images/rnn.png
--- a/doc/fluid/images/rnn_2level_data.dot
+++ b/doc/fluid/images/rnn_2level_data.dot
--- a/doc/fluid/images/rnn_2level_data.png
+++ b/doc/fluid/images/rnn_2level_data.png
--- a/doc/fluid/images/scope_variable_tensor.png
+++ b/doc/fluid/images/scope_variable_tensor.png
--- a/doc/fluid/images/single-thread@3x.png
+++ b/doc/fluid/images/single-thread@3x.png
--- a/doc/fluid/images/sorted_input.png
+++ b/doc/fluid/images/sorted_input.png
--- a/doc/fluid/images/sparse_update.graffle
+++ b/doc/fluid/images/sparse_update.graffle
--- a/doc/fluid/images/sparse_update.png
+++ b/doc/fluid/images/sparse_update.png
--- a/doc/fluid/images/test.dot
+++ b/doc/fluid/images/test.dot
--- a/doc/fluid/images/test.dot.png
+++ b/doc/fluid/images/test.dot.png
--- a/doc/fluid/images/theta_star.gif
+++ b/doc/fluid/images/theta_star.gif
--- a/doc/fluid/images/timeline.jpeg
+++ b/doc/fluid/images/timeline.jpeg
--- a/doc/fluid/images/tracing.jpeg
+++ b/doc/fluid/images/tracing.jpeg
--- a/doc/fluid/images/transpiler.png
+++ b/doc/fluid/images/transpiler.png
--- a/doc/fluid/images/user_interface.png
+++ b/doc/fluid/images/user_interface.png
--- a/doc/fluid/index_cn.rst
+++ b/doc/fluid/index_cn.rst
--- a/doc/fluid/index_en.rst
+++ b/doc/fluid/index_en.rst
--- a/doc/fluid/new_docs/advanced_usage/benchmark.rst
+++ b/doc/fluid/new_docs/advanced_usage/benchmark.rst
--- a/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md
+++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md
--- a/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md
+++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md
--- a/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md
+++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md
--- a/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md
+++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md
--- a/doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst
--- a/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md
+++ b/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md
--- a/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md
+++ b/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md
--- a/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
+++ b/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
--- a/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
--- a/doc/fluid/new_docs/advanced_usage/deploy/index_mobile.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/index_mobile.rst
--- a/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst
--- a/doc/fluid/new_docs/advanced_usage/deploy/install_anakin.md
+++ b/doc/fluid/new_docs/advanced_usage/deploy/install_anakin.md
--- a/doc/fluid/new_docs/advanced_usage/deploy/mobile_build.md
+++ b/doc/fluid/new_docs/advanced_usage/deploy/mobile_build.md
--- a/doc/fluid/new_docs/advanced_usage/deploy/mobile_dev.md
+++ b/doc/fluid/new_docs/advanced_usage/deploy/mobile_dev.md
--- a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
--- a/doc/fluid/new_docs/advanced_usage/deploy/run_anakin_on_arm.md
+++ b/doc/fluid/new_docs/advanced_usage/deploy/run_anakin_on_arm.md
--- a/doc/fluid/new_docs/advanced_usage/development/gpu_profiling_cn.rst
+++ b/doc/fluid/new_docs/advanced_usage/development/gpu_profiling_cn.rst
--- a/doc/fluid/new_docs/advanced_usage/development/nvvp1.png
+++ b/doc/fluid/new_docs/advanced_usage/development/nvvp1.png
--- a/doc/fluid/new_docs/advanced_usage/development/nvvp2.png
+++ b/doc/fluid/new_docs/advanced_usage/development/nvvp2.png
--- a/doc/fluid/new_docs/advanced_usage/development/nvvp3.png
+++ b/doc/fluid/new_docs/advanced_usage/development/nvvp3.png
--- a/doc/fluid/new_docs/advanced_usage/development/nvvp4.png
+++ b/doc/fluid/new_docs/advanced_usage/development/nvvp4.png
--- a/doc/fluid/new_docs/advanced_usage/development/pprof_1.png
+++ b/doc/fluid/new_docs/advanced_usage/development/pprof_1.png
--- a/doc/fluid/new_docs/advanced_usage/development/pprof_2.png
+++ b/doc/fluid/new_docs/advanced_usage/development/pprof_2.png
--- a/doc/fluid/new_docs/advanced_usage/development/timeline.jpeg
+++ b/doc/fluid/new_docs/advanced_usage/development/timeline.jpeg
--- a/doc/fluid/new_docs/advanced_usage/development/tracing.jpeg
+++ b/doc/fluid/new_docs/advanced_usage/development/tracing.jpeg
--- a/doc/fluid/new_docs/advanced_usage/development/write_docs.rst
+++ b/doc/fluid/new_docs/advanced_usage/development/write_docs.rst
--- a/doc/fluid/new_docs/advanced_usage/index.rst
+++ b/doc/fluid/new_docs/advanced_usage/index.rst
--- a/doc/fluid/new_docs/advanced_usage/pics/anakin_fm_ch.png
+++ b/doc/fluid/new_docs/advanced_usage/pics/anakin_fm_ch.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
--- a/doc/fluid/new_docs/beginners_guide/basics/index.rst
+++ b/doc/fluid/new_docs/beginners_guide/basics/index.rst
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
--- a/doc/fluid/new_docs/beginners_guide/basics/learning_materials.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/learning_materials.md
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
--- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore
--- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
--- a/doc/fluid/new_docs/beginners_guide/index.rst
+++ b/doc/fluid/new_docs/beginners_guide/index.rst
--- a/doc/fluid/new_docs/beginners_guide/install/install_doc.rst
+++ b/doc/fluid/new_docs/beginners_guide/install/install_doc.rst
--- a/doc/fluid/new_docs/beginners_guide/install/paddleci.png
+++ b/doc/fluid/new_docs/beginners_guide/install/paddleci.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
--- a/doc/fluid/new_docs/beginners_guide/quick_start/index.rst
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/index.rst
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
--- a/doc/fluid/new_docs/faq/faq.rst
+++ b/doc/fluid/new_docs/faq/faq.rst
--- a/doc/fluid/new_docs/faq/index_cn.rst
+++ b/doc/fluid/new_docs/faq/index_cn.rst
--- a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_basic_concept.rst
+++ b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_basic_concept.rst
--- a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_local_train.jpeg
+++ b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_local_train.jpeg
--- a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_mnist.png
+++ b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_mnist.png
--- a/doc/fluid/new_docs/user_guides/howto/configure_simple_model/index.rst
+++ b/doc/fluid/new_docs/user_guides/howto/configure_simple_model/index.rst
--- a/doc/fluid/new_docs/user_guides/howto/debug/index.rst
+++ b/doc/fluid/new_docs/user_guides/howto/debug/index.rst
--- a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
+++ b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
--- a/doc/fluid/new_docs/user_guides/howto/evaluation/index.rst
+++ b/doc/fluid/new_docs/user_guides/howto/evaluation/index.rst
--- a/doc/fluid/new_docs/user_guides/howto/evaluation/metrics.rst
+++ b/doc/fluid/new_docs/user_guides/howto/evaluation/metrics.rst
--- a/doc/fluid/new_docs/user_guides/howto/modification/foo.rst
+++ b/doc/fluid/new_docs/user_guides/howto/modification/foo.rst
--- a/doc/fluid/new_docs/user_guides/howto/prepare_data/feeding_data.rst
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/feeding_data.rst
--- a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
--- a/doc/fluid/new_docs/user_guides/howto/prepare_data/reader.md
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/reader.md
--- a/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
--- a/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_cn.md
+++ b/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_cn.md
--- a/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_en.md
+++ b/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_en.md
--- a/doc/fluid/new_docs/user_guides/howto/training/cluster_howto.rst
+++ b/doc/fluid/new_docs/user_guides/howto/training/cluster_howto.rst
--- a/doc/fluid/new_docs/user_guides/howto/training/cluster_quick_start.rst
+++ b/doc/fluid/new_docs/user_guides/howto/training/cluster_quick_start.rst
--- a/doc/fluid/new_docs/user_guides/howto/training/index.rst
+++ b/doc/fluid/new_docs/user_guides/howto/training/index.rst
--- a/doc/fluid/new_docs/user_guides/howto/training/multi_node.rst
+++ b/doc/fluid/new_docs/user_guides/howto/training/multi_node.rst
--- a/doc/fluid/new_docs/user_guides/howto/training/save_load_variables.rst
+++ b/doc/fluid/new_docs/user_guides/howto/training/save_load_variables.rst
--- a/doc/fluid/new_docs/user_guides/howto/training/single_node.rst
+++ b/doc/fluid/new_docs/user_guides/howto/training/single_node.rst
--- a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.graffle
+++ b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.graffle
--- a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.png
+++ b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.png
--- a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.graffle
+++ b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.graffle
--- a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.png
+++ b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.png
--- a/doc/fluid/new_docs/user_guides/howto/training/src/parallelism.png
+++ b/doc/fluid/new_docs/user_guides/howto/training/src/parallelism.png
--- a/doc/fluid/new_docs/user_guides/howto/training/test_while_training.rst
+++ b/doc/fluid/new_docs/user_guides/howto/training/test_while_training.rst
--- a/doc/fluid/new_docs/user_guides/index.rst
+++ b/doc/fluid/new_docs/user_guides/index.rst
--- a/doc/fluid/new_docs/user_guides/models/index.rst
+++ b/doc/fluid/new_docs/user_guides/models/index.rst
--- a/doc/fluid/read_source.md
+++ b/doc/fluid/read_source.md
--- a/doc/survey/dynamic_graph.md
+++ b/doc/survey/dynamic_graph.md
--- a/paddle/contrib/float16/float16_transpiler.py
+++ b/paddle/contrib/float16/float16_transpiler.py
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
--- a/paddle/fluid/framework/attribute.cc
+++ b/paddle/fluid/framework/attribute.cc
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
--- a/paddle/fluid/framework/details/cow_ptr_test.cc
+++ b/paddle/fluid/framework/details/cow_ptr_test.cc
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
--- a/paddle/fluid/framework/details/reference_count_op_handle.h
+++ b/paddle/fluid/framework/details/reference_count_op_handle.h
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
--- a/paddle/fluid/framework/details/reference_count_pass.h
+++ b/paddle/fluid/framework/details/reference_count_pass.h
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
--- a/paddle/fluid/framework/ir/fuse_pass_base.cc
+++ b/paddle/fluid/framework/ir/fuse_pass_base.cc
--- a/paddle/fluid/framework/ir/fuse_pass_base.h
+++ b/paddle/fluid/framework/ir/fuse_pass_base.h
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
--- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
--- a/paddle/fluid/framework/ir/graph_viz_pass.h
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
--- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
--- a/paddle/fluid/operators/sequence_softmax_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.h
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.h
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
--- a/paddle/fluid/framework/ir/pass_builder.cc
+++ b/paddle/fluid/framework/ir/pass_builder.cc
--- a/paddle/fluid/framework/ir/pass_builder.h
+++ b/paddle/fluid/framework/ir/pass_builder.h
--- a/paddle/fluid/framework/ir/pass_test.cc
+++ b/paddle/fluid/framework/ir/pass_test.cc
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
--- a/paddle/fluid/framework/lod_tensor_array.h
+++ b/paddle/fluid/framework/lod_tensor_array.h
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
--- a/paddle/fluid/framework/naive_executor_test.cc
+++ b/paddle/fluid/framework/naive_executor_test.cc
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
--- a/paddle/fluid/framework/prune.cc
+++ b/paddle/fluid/framework/prune.cc
--- a/paddle/fluid/framework/prune.h
+++ b/paddle/fluid/framework/prune.h
--- a/paddle/fluid/framework/reader_test.cc
+++ b/paddle/fluid/framework/reader_test.cc
--- a/paddle/fluid/framework/rw_lock.h
+++ b/paddle/fluid/framework/rw_lock.h
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
--- a/paddle/fluid/framework/tensor_util_test.cu
+++ b/paddle/fluid/framework/tensor_util_test.cu
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
--- a/paddle/fluid/framework/threadpool_test.cc
+++ b/paddle/fluid/framework/threadpool_test.cc
--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
--- a/paddle/fluid/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
--- a/paddle/fluid/framework/variable.h
+++ b/paddle/fluid/framework/variable.h
--- a/paddle/fluid/framework/variable_test.cc
+++ b/paddle/fluid/framework/variable_test.cc
--- a/paddle/fluid/framework/version.cc
+++ b/paddle/fluid/framework/version.cc
--- a/paddle/fluid/framework/version.h
+++ b/paddle/fluid/framework/version.h
--- a/paddle/fluid/framework/version_test.cc
+++ b/paddle/fluid/framework/version_test.cc
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
--- a/paddle/fluid/inference/analysis/pass.cc
+++ b/paddle/fluid/inference/analysis/pass.cc
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
--- a/paddle/fluid/inference/analysis/flags.h
+++ b/paddle/fluid/inference/analysis/flags.h
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
--- a/paddle/fluid/inference/analysis/model_store_pass.h
+++ b/paddle/fluid/inference/analysis/model_store_pass.h
--- a/paddle/fluid/inference/analysis/pass_manager.cc
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
--- a/paddle/fluid/inference/analysis/pass_manager.h
+++ b/paddle/fluid/inference/analysis/pass_manager.h
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
--- a/paddle/fluid/inference/analysis/subgraph_splitter.h
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.h
--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
--- a/paddle/fluid/inference/api/api_anakin_engine.h
+++ b/paddle/fluid/inference/api/api_anakin_engine.h
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
--- a/paddle/fluid/inference/api/demo_ci/utils.h
+++ b/paddle/fluid/inference/api/demo_ci/utils.h
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
--- a/paddle/fluid/inference/api/demo_ci/windows_inference.md
+++ b/paddle/fluid/inference/api/demo_ci/windows_inference.md
--- a/paddle/fluid/inference/api/details/reset_tensor_array.cc
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
--- a/paddle/fluid/inference/api/details/reset_tensor_array.h
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
--- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
--- a/paddle/fluid/inference/paddle_fluid.map
+++ b/paddle/fluid/inference/paddle_fluid.map
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
--- a/paddle/fluid/inference/api/api_anakin_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc
--- a/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
--- a/paddle/fluid/inference/analysis/chinese_ner_tester.cc
+++ b/paddle/fluid/inference/analysis/chinese_ner_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
--- a/paddle/fluid/operators/adadelta_op.cc
+++ b/paddle/fluid/operators/adadelta_op.cc
--- a/paddle/fluid/operators/adadelta_op.h
+++ b/paddle/fluid/operators/adadelta_op.h
--- a/paddle/fluid/operators/adagrad_op.h
+++ b/paddle/fluid/operators/adagrad_op.h
--- a/paddle/fluid/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
--- a/paddle/fluid/operators/adamax_op.cc
+++ b/paddle/fluid/operators/adamax_op.cc
--- a/paddle/fluid/operators/adamax_op.h
+++ b/paddle/fluid/operators/adamax_op.h
--- a/paddle/fluid/operators/affine_channel_op.cc
+++ b/paddle/fluid/operators/affine_channel_op.cc
--- a/paddle/fluid/operators/affine_channel_op.cu
+++ b/paddle/fluid/operators/affine_channel_op.cu
--- a/paddle/fluid/operators/argsort_op.cc
+++ b/paddle/fluid/operators/argsort_op.cc
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
--- a/paddle/fluid/operators/channel_create_op.cc
+++ b/paddle/fluid/operators/channel_create_op.cc
--- a/paddle/fluid/operators/channel_recv_op.cc
+++ b/paddle/fluid/operators/channel_recv_op.cc
--- a/paddle/fluid/operators/clip_by_norm_op.h
+++ b/paddle/fluid/operators/clip_by_norm_op.h
--- a/paddle/fluid/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
--- a/paddle/fluid/operators/concurrency/CMakeLists.txt
+++ b/paddle/fluid/operators/concurrency/CMakeLists.txt
--- a/paddle/fluid/operators/concurrency/channel_util.cc
+++ b/paddle/fluid/operators/concurrency/channel_util.cc
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
--- a/paddle/fluid/operators/cub_reduce.h
+++ b/paddle/fluid/operators/cub_reduce.h
--- a/paddle/fluid/operators/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/decayed_adagrad_op.cc
--- a/paddle/fluid/operators/decayed_adagrad_op.h
+++ b/paddle/fluid/operators/decayed_adagrad_op.h
--- a/paddle/fluid/operators/detail/safe_ref.h
+++ b/paddle/fluid/operators/detail/safe_ref.h
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
--- a/paddle/fluid/operators/detection/bbox_util.h
+++ b/paddle/fluid/operators/detection/bbox_util.h
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
--- a/paddle/fluid/operators/detection/gpc.cc
+++ b/paddle/fluid/operators/detection/gpc.cc
--- a/paddle/fluid/operators/detection/gpc.h
+++ b/paddle/fluid/operators/detection/gpc.h
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
--- a/paddle/fluid/operators/detection/poly_util.cc
+++ b/paddle/fluid/operators/detection/poly_util.cc
--- a/paddle/fluid/operators/detection/poly_util.h
+++ b/paddle/fluid/operators/detection/poly_util.h
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
--- a/paddle/fluid/operators/distributed/proto_encoder_helper.h
+++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
--- a/paddle/fluid/operators/distributed/request_handler_impl.h
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
--- a/paddle/fluid/operators/distributed/varhandle_test.cc
+++ b/paddle/fluid/operators/distributed/varhandle_test.cc
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
--- a/paddle/fluid/operators/extract_rows_op.cc
+++ b/paddle/fluid/operators/extract_rows_op.cc
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
--- a/paddle/fluid/operators/channel_send_op.cc
+++ b/paddle/fluid/operators/channel_send_op.cc
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
--- a/paddle/fluid/operators/flatten_op.cc
+++ b/paddle/fluid/operators/flatten_op.cc
--- a/paddle/fluid/operators/ftrl_op.cc
+++ b/paddle/fluid/operators/ftrl_op.cc
--- a/paddle/fluid/operators/ftrl_op.h
+++ b/paddle/fluid/operators/ftrl_op.h
--- a/paddle/fluid/operators/fused_elemwise_activation_op.cc
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc
--- a/paddle/fluid/operators/fused_elemwise_activation_op.h
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.h
--- a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
--- a/paddle/fluid/operators/concurrency/channel_util.h
+++ b/paddle/fluid/operators/concurrency/channel_util.h
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
--- a/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
+++ b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
--- a/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h
+++ b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h
--- a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
--- a/paddle/fluid/operators/gather.h
+++ b/paddle/fluid/operators/gather.h
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
--- a/paddle/fluid/operators/hash_op.cc
+++ b/paddle/fluid/operators/hash_op.cc
--- a/paddle/fluid/operators/channel_close_op.cc
+++ b/paddle/fluid/operators/channel_close_op.cc
--- a/paddle/fluid/operators/isfinite_op.cc
+++ b/paddle/fluid/operators/isfinite_op.cc
--- a/paddle/fluid/operators/isfinite_op.cu
+++ b/paddle/fluid/operators/isfinite_op.cu
--- a/paddle/fluid/operators/isfinite_op.h
+++ b/paddle/fluid/operators/isfinite_op.h
--- a/paddle/fluid/operators/lars_momentum_op.cc
+++ b/paddle/fluid/operators/lars_momentum_op.cc
--- a/paddle/fluid/operators/lars_momentum_op.cu
+++ b/paddle/fluid/operators/lars_momentum_op.cu
--- a/paddle/fluid/operators/lars_momentum_op.h
+++ b/paddle/fluid/operators/lars_momentum_op.h
--- a/paddle/fluid/operators/layer_norm_op.cu
+++ b/paddle/fluid/operators/layer_norm_op.cu
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
--- a/paddle/fluid/operators/math/algorithm.h
+++ b/paddle/fluid/operators/math/algorithm.h
--- a/paddle/fluid/operators/math/compound_functors.h
+++ b/paddle/fluid/operators/math/compound_functors.h
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat.h
--- a/paddle/fluid/operators/math/concat_test.cc
+++ b/paddle/fluid/operators/math/concat_test.cc
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
--- a/paddle/fluid/operators/math/cross_entropy.cc
+++ b/paddle/fluid/operators/math/cross_entropy.cc
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ b/paddle/fluid/operators/math/depthwise_conv.h
--- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
--- a/paddle/fluid/operators/math/functors.h
+++ b/paddle/fluid/operators/math/functors.h
--- a/paddle/fluid/operators/math/jit_kernel.cc
+++ b/paddle/fluid/operators/math/jit_kernel.cc
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
--- a/paddle/fluid/operators/math/jit_kernel_macro.h
+++ b/paddle/fluid/operators/math/jit_kernel_macro.h
--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
--- a/paddle/fluid/operators/math/sequence_pooling_test.cc
+++ b/paddle/fluid/operators/math/sequence_pooling_test.cc
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
--- a/paddle/fluid/operators/merge_ids_op.cc
+++ b/paddle/fluid/operators/merge_ids_op.cc
--- a/paddle/fluid/operators/merge_ids_op.h
+++ b/paddle/fluid/operators/merge_ids_op.h
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
--- a/paddle/fluid/operators/norm_op.cu
+++ b/paddle/fluid/operators/norm_op.cu
--- a/paddle/fluid/operators/norm_op.h
+++ b/paddle/fluid/operators/norm_op.h
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
--- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
+++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
--- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
+++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
--- a/paddle/fluid/operators/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_mean_op.cu
--- a/paddle/fluid/operators/reduce_sum_op.cu
+++ b/paddle/fluid/operators/reduce_sum_op.cu
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
--- a/paddle/fluid/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
--- a/paddle/fluid/operators/roi_align_op.cu
+++ b/paddle/fluid/operators/roi_align_op.cu
--- a/paddle/fluid/operators/roi_align_op.h
+++ b/paddle/fluid/operators/roi_align_op.h
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
--- a/paddle/fluid/operators/roi_pool_op.h
+++ b/paddle/fluid/operators/roi_pool_op.h
--- a/paddle/fluid/operators/sampling_id_op.cc
+++ b/paddle/fluid/operators/sampling_id_op.cc
--- a/paddle/fluid/operators/sampling_id_op.h
+++ b/paddle/fluid/operators/sampling_id_op.h
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
--- a/paddle/fluid/operators/select_op.cc
+++ b/paddle/fluid/operators/select_op.cc
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
--- a/paddle/fluid/operators/sequence_concat_op.cu.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cu.cc
--- a/paddle/fluid/operators/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_concat_op.h
--- a/paddle/fluid/operators/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cc
--- a/paddle/fluid/operators/sequence_enumerate_op.cc
+++ b/paddle/fluid/operators/sequence_enumerate_op.cc
--- a/paddle/fluid/operators/sequence_enumerate_op.cu
+++ b/paddle/fluid/operators/sequence_enumerate_op.cu
--- a/paddle/fluid/operators/sequence_enumerate_op.h
+++ b/paddle/fluid/operators/sequence_enumerate_op.h
--- a/paddle/fluid/operators/sequence_erase_op.cc
+++ b/paddle/fluid/operators/sequence_erase_op.cc
--- a/paddle/fluid/operators/sequence_expand_as_op.cc
+++ b/paddle/fluid/operators/sequence_expand_as_op.cc
--- a/paddle/fluid/operators/sequence_expand_as_op.cu
+++ b/paddle/fluid/operators/sequence_expand_as_op.cu
--- a/paddle/fluid/operators/sequence_expand_as_op.h
+++ b/paddle/fluid/operators/sequence_expand_as_op.h
--- a/paddle/fluid/operators/sequence_mask_op.cc
+++ b/paddle/fluid/operators/sequence_mask_op.cc
--- a/paddle/fluid/operators/sequence_mask_op.cu
+++ b/paddle/fluid/operators/sequence_mask_op.cu
--- a/paddle/fluid/operators/sequence_pad_op.cc
+++ b/paddle/fluid/operators/sequence_pad_op.cc
--- a/paddle/fluid/operators/sequence_pad_op.h
+++ b/paddle/fluid/operators/sequence_pad_op.h
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
--- a/paddle/fluid/operators/sequence_reshape_op.cc
+++ b/paddle/fluid/operators/sequence_reshape_op.cc
--- a/paddle/fluid/operators/sequence_reverse_op.cc
+++ b/paddle/fluid/operators/sequence_reverse_op.cc
--- a/paddle/fluid/operators/sequence_reverse_op.cu
+++ b/paddle/fluid/operators/sequence_reverse_op.cu
--- a/paddle/fluid/operators/sequence_reverse_op.h
+++ b/paddle/fluid/operators/sequence_reverse_op.h
--- a/paddle/fluid/operators/sequence_scatter_op.cc
+++ b/paddle/fluid/operators/sequence_scatter_op.cc
--- a/paddle/fluid/operators/sequence_scatter_op.h
+++ b/paddle/fluid/operators/sequence_scatter_op.h
--- a/paddle/fluid/operators/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_slice_op.h
--- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
--- a/paddle/fluid/operators/sequence_softmax_op.cu
+++ b/paddle/fluid/operators/sequence_softmax_op.cu
--- a/paddle/fluid/operators/sequence_softmax_op.h
+++ b/paddle/fluid/operators/sequence_softmax_op.h
--- a/paddle/fluid/operators/sequence_unpad_op.cc
+++ b/paddle/fluid/operators/sequence_unpad_op.cc
--- a/paddle/fluid/operators/sequence_unpad_op.cu
+++ b/paddle/fluid/operators/sequence_unpad_op.cu
--- a/paddle/fluid/operators/sequence_unpad_op.h
+++ b/paddle/fluid/operators/sequence_unpad_op.h
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
--- a/paddle/fluid/operators/slice_op.cu
+++ b/paddle/fluid/operators/slice_op.cu
--- a/paddle/fluid/operators/slice_op.h
+++ b/paddle/fluid/operators/slice_op.h
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
--- a/paddle/fluid/operators/split_ids_op.cc
+++ b/paddle/fluid/operators/split_ids_op.cc
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
--- a/paddle/fluid/operators/transpose_op.cu.cc
+++ b/paddle/fluid/operators/transpose_op.cu.cc
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cu
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
--- a/paddle/fluid/platform/macros.h
+++ b/paddle/fluid/platform/macros.h
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/string/CMakeLists.txt
+++ b/paddle/fluid/string/CMakeLists.txt
--- a/paddle/fluid/string/pretty_log.cc
+++ b/paddle/fluid/string/pretty_log.cc
--- a/paddle/fluid/string/pretty_log.h
+++ b/paddle/fluid/string/pretty_log.h
--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
--- a/paddle/fluid/train/demo/CMakeLists.txt
+++ b/paddle/fluid/train/demo/CMakeLists.txt
--- a/paddle/fluid/train/demo/README.md
+++ b/paddle/fluid/train/demo/README.md
--- a/paddle/fluid/train/test_train_recognize_digits.cc
+++ b/paddle/fluid/train/test_train_recognize_digits.cc
--- a/paddle/legacy/trainer/tests/CMakeLists.txt
+++ b/paddle/legacy/trainer/tests/CMakeLists.txt
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
--- a/python/paddle/dataset/sentiment.py
+++ b/python/paddle/dataset/sentiment.py
--- a/python/paddle/dataset/uci_housing.py
+++ b/python/paddle/dataset/uci_housing.py
--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
--- a/python/paddle/fluid/contrib/inferencer.py
+++ b/python/paddle/fluid/contrib/inferencer.py
--- a/python/paddle/fluid/contrib/op_frequence.py
+++ b/python/paddle/fluid/contrib/op_frequence.py
--- a/python/paddle/fluid/contrib/quantize/__init__.py
+++ b/python/paddle/fluid/contrib/quantize/__init__.py