Commit 7119d6c3 authored by Yu Yang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into revert_callstack

......@@ -12,7 +12,6 @@ services:
os:
- linux
env:
- JOB=doc
- JOB=check_style
- JOB=build_android
addons:
......
......@@ -69,6 +69,7 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocol" OFF)
option(WITH_INFERENCE "Compile fluid inference library" ON)
option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
......@@ -213,9 +214,11 @@ include(configure) # add paddle env configuration
if(WITH_GPU)
include(cuda)
include(tensorrt)
endif()
if(WITH_MKL OR WITH_MKLML)
include(external/anakin)
elseif()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
endif()
include(generic) # simplify cmake module
......
......@@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in the IPluginFactory class of NvInfer.h; otherwise it cannot work in Paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr
......
......@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0)
### Install Latest Stable Release:
```
# Linux CPU
......@@ -27,9 +27,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==0.14.0.post87
pip install paddlepaddle-gpu==0.15.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==0.14.0.post85
pip install paddlepaddle-gpu==0.15.0.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
......@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85
## Installation
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website.
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website.
## Documentation
We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation.
We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html)
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html)
You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html)
- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html)
Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
We appreciate your contributions!
......
......@@ -11,6 +11,7 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
# example: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
......@@ -27,5 +28,6 @@ ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
ADD models/ /workspace/models/
......@@ -17,7 +17,8 @@ import argparse
__all__ = ['parse_args', ]
BENCHMARK_MODELS = [
"machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
"machine_translation", "resnet", "se_resnext", "vgg", "mnist",
"stacked_dynamic_lstm", "resnet_with_preprocess"
]
......@@ -67,12 +68,12 @@ def parse_args():
'--cpus',
type=int,
default=1,
help='If cpus > 1, will use ParallelDo to run, else use Executor.')
help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
parser.add_argument(
'--data_set',
type=str,
default='flowers',
choices=['cifar10', 'flowers'],
choices=['cifar10', 'flowers', 'imagenet'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
......@@ -122,6 +123,11 @@ def parse_args():
type=str,
default="",
help='Directory that contains all the training recordio files.')
parser.add_argument(
'--test_data_path',
type=str,
default="",
help='Directory that contains all the test data (NOT recordio).')
parser.add_argument(
'--use_inference_transpiler',
action='store_true',
......@@ -130,5 +136,11 @@ def parse_args():
'--no_random',
action='store_true',
help='If set, keep the random seed and do not shuffle the data.')
parser.add_argument(
'--reduce_strategy',
type=str,
choices=['reduce', 'all_reduce'],
default='all_reduce',
help='Specify the reduce strategy, can be reduce, all_reduce')
args = parser.parse_args()
return args
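For context, a minimal sketch of how these new flags are consumed later in this diff: `--cpus` feeds `ExecutionStrategy.num_threads` and `--reduce_strategy` selects the gradient aggregation mode on `BuildStrategy`. The helper name and wiring below are illustrative assumptions, mirroring the code in `train_parallel`:

```python
import paddle.fluid as fluid

def strategies_from_args(args):  # hypothetical helper, for illustration only
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = args.cpus  # --cpus: executor thread count
    build_strategy = fluid.BuildStrategy()
    if args.reduce_strategy == "reduce":
        # Each device reduces a shard of the gradients, then broadcasts.
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.Reduce
    else:
        # Default: every device all-reduces every gradient.
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.AllReduce
    return exec_strategy, build_strategy
```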
......@@ -16,6 +16,7 @@ import argparse
import cProfile
import time
import os
import traceback
import numpy as np
......@@ -27,7 +28,7 @@ import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
from args import *
def append_nccl2_prepare(trainer_id):
def append_nccl2_prepare(trainer_id, startup_prog):
if trainer_id >= 0:
# append gen_nccl_id at the end of startup program
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
......@@ -40,11 +41,11 @@ def append_nccl2_prepare(trainer_id):
current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
worker_endpoints.remove(current_endpoint)
nccl_id_var = fluid.default_startup_program().global_block().create_var(
nccl_id_var = startup_prog.global_block().create_var(
name="NCCLID",
persistable=True,
type=fluid.core.VarDesc.VarType.RAW)
fluid.default_startup_program().global_block().append_op(
startup_prog.global_block().append_op(
type="gen_nccl_id",
inputs={},
outputs={"NCCLID": nccl_id_var},
......@@ -59,7 +60,7 @@ def append_nccl2_prepare(trainer_id):
"nccl-based dist train.")
def dist_transpile(trainer_id, args):
def dist_transpile(trainer_id, args, train_prog, startup_prog):
if trainer_id < 0:
return None, None
......@@ -80,133 +81,70 @@ def dist_transpile(trainer_id, args):
# the role, should be either PSERVER or TRAINER
training_role = os.getenv("PADDLE_TRAINING_ROLE")
t = distribute_transpiler.DistributeTranspiler()
config = distribute_transpiler.DistributeTranspilerConfig()
config.slice_var_up = not args.no_split_var
t = distribute_transpiler.DistributeTranspiler(config=config)
t.transpile(
trainer_id,
# NOTE: *MUST* use train_prog here, because we use program guards
# to generate different programs for train and test.
program=train_prog,
pservers=pserver_endpoints,
trainers=trainers,
sync_mode=not args.async_mode)
sync_mode=not args.async_mode,
startup_program=startup_prog)
if training_role == "PSERVER":
pserver_program = t.get_pserver_program(current_endpoint)
pserver_startup_program = t.get_startup_program(current_endpoint,
pserver_program)
pserver_startup_program = t.get_startup_program(
current_endpoint, pserver_program, startup_program=startup_prog)
return pserver_program, pserver_startup_program
elif training_role == "TRAINER":
train_program = t.get_trainer_program()
return train_program, fluid.default_startup_program()
return train_program, startup_prog
else:
raise ValueError(
'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
)
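A hedged usage sketch of the transpile path above; the environment variables and role dispatch follow the code in this diff, while the default trainer id is a placeholder:

```python
import os
import paddle.fluid as fluid
from args import parse_args  # args.py shown earlier in this diff

args = parse_args()
train_prog, startup_prog = fluid.Program(), fluid.Program()
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))

# Build the model into train_prog/startup_prog first (see get_model below),
# then rewrite both programs for the current role. PADDLE_TRAINING_ROLE
# decides what comes back: a PSERVER receives a listen-and-serve program,
# a TRAINER a program that pushes gradients to the pservers.
prog, start = dist_transpile(trainer_id, args, train_prog, startup_prog)
if prog is None:
    raise Exception("Must configure correct environments to run dist train.")
```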
def test(exe, inference_program, test_reader, feeder, batch_acc):
accuracy_evaluator = fluid.metrics.Accuracy()
for batch_id, data in enumerate(test_reader()):
acc = exe.run(inference_program,
feed=feeder.feed(data),
fetch_list=[batch_acc])
accuracy_evaluator.update(value=np.array(acc), weight=len(data))
def test_parallel(exe, test_args, args, test_prog, feeder):
acc_evaluators = []
for i in xrange(len(test_args[2])):
acc_evaluators.append(fluid.metrics.Accuracy())
return accuracy_evaluator.eval()
# TODO(wuyi): replace train, train_parallel, test functions with new trainer
# API once it is ready.
def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
args, train_prog, startup_prog):
if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
place = core.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
exe.run(train_prog)
return
if args.use_fake_data:
raise Exception(
"fake data is not supported in single GPU test for now.")
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_prog)
# Use inference_transpiler to speed up
if not args.use_reader_op:
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.pass_num):
train_losses = []
if not args.use_reader_op:
reader_generator = train_reader()
batch_id = 0
data = None
to_fetch = [v.name for v in test_args[2]]
if args.use_reader_op:
test_args[4].start()
while True:
if not args.use_reader_op:
data = next(reader_generator, None)
if data == None:
break
if iters == args.iterations:
reader_generator.close()
try:
acc_rets = exe.run(fetch_list=to_fetch)
for i, e in enumerate(acc_evaluators):
e.update(
value=np.array(acc_rets[i]), weight=args.batch_size)
except fluid.core.EOFException as eof:
test_args[4].reset()
break
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
else:
for batch_id, data in enumerate(test_args[3]()):
acc_rets = exe.run(feed=feeder.feed(data), fetch_list=to_fetch)
for i, e in enumerate(acc_evaluators):
e.update(value=np.array(acc_rets[i]), weight=len(data))
if args.use_reader_op:
try:
loss = exe.run(train_prog, fetch_list=[avg_loss])
except fluid.core.EnforceNotMet as ex:
break
else:
loss = exe.run(train_prog,
feed=feeder.feed(data),
fetch_list=[avg_loss])
iters += 1
batch_id += 1
# FIXME(wuyi): For use_reader_op, if the current
# pass is not the last, the last batch of this pass
# is also equal to args.batch_size.
if args.use_reader_op:
num_samples += args.batch_size * args.gpus
else:
num_samples += len(data)
train_losses.append(loss)
print("Pass: %d, Iter: %d, Loss: %f\n" %
(pass_id, iters, np.mean(train_losses)))
print_train_time(start_time, time.time(), num_samples)
print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
# evaluation
if not args.no_test and batch_acc and not args.use_reader_op:
if args.use_inference_transpiler:
t = fluid.InferenceTranspiler()
t.transpile(infer_prog, place)
pass_test_acc = test(exe, infer_prog, test_reader, feeder,
batch_acc)
print(", Test Accuracy: %f" % pass_test_acc)
print("\n")
# TODO(wuyi): add warmup passes to get better perf data.
exit(0)
return [e.eval() for e in acc_evaluators]
# TODO(wuyi): replace train, train_parallel, test functions with new trainer
# API once it is ready.
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_acc, args, train_prog, startup_prog, nccl_id_var,
num_trainers, trainer_id):
# NOTE: we only need to benchmark using ParallelExecutor
def train_parallel(train_args, test_args, args, train_prog, test_prog,
startup_prog, nccl_id_var, num_trainers, trainer_id):
over_all_start = time.time()
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
feeder = None
if not args.use_reader_op:
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
# generate fake:
if args.use_fake_data:
for var in feed_var_list:
......@@ -230,63 +168,119 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy()
strategy.num_threads = 1
strategy.num_threads = args.cpus
strategy.allow_op_delay = False
build_strategy = fluid.BuildStrategy()
if args.reduce_strategy == "reduce":
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.Reduce
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
avg_loss = train_args[0]
if args.update_method == "pserver":
# In parameter-server mode distributed training, gradients are
# merged on the local server, so do not initialize
# ParallelExecutor in multi-server all-reduce mode.
num_trainers = 1
trainer_id = 0
exe = fluid.ParallelExecutor(
True,
avg_loss.name,
main_program=train_prog,
exec_strategy=strategy,
build_strategy=build_strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
if not args.no_test:
if args.update_method == "pserver":
test_scope = None
else:
# NOTE: use an empty scope to avoid test exe using NCCLID
test_scope = fluid.Scope()
test_exe = fluid.ParallelExecutor(
True, main_program=test_prog, share_vars_from=exe)
for pass_id in range(args.pass_num):
num_samples = 0
iters = 0
start_time = time.time()
if not args.use_reader_op:
reader_generator = train_reader()
reader_generator = train_args[3]() #train_reader
batch_id = 0
data = None
if args.use_reader_op:
train_args[4].start()
while True:
if not args.use_reader_op:
data = next(reader_generator, None)
if data == None:
break
if args.profile and batch_id == 5:
profiler.start_profiler("All")
profiler.reset_profiler()
elif args.profile and batch_id == 10:
print("profiling total time: ", time.time() - start_time)
profiler.stop_profiler("total", "/tmp/profile_%d_pass%d" %
(trainer_id, pass_id))
if iters == args.iterations:
reader_generator.close()
break
if args.profile and pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
elif args.profile and pass_id == 0 and batch_id == 10:
profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
fetch_list = [avg_loss.name]
acc_name_list = [v.name for v in train_args[2]]
fetch_list.extend(acc_name_list)
if args.use_fake_data or args.use_reader_op:
try:
loss, = exe.run([avg_loss.name])
fetch_ret = exe.run(fetch_list)
except fluid.core.EOFException as eof:
break
except fluid.core.EnforceNotMet as ex:
traceback.print_exc()
break
else:
loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
if args.use_reader_op:
num_samples += args.batch_size * args.gpus
else:
num_samples += len(data)
iters += 1
if batch_id % 1 == 0:
print("Pass %d, batch %d, loss %s" %
(pass_id, batch_id, np.array(loss)))
fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
print("Pass %d, batch %d, loss %s, accucacys: %s" %
(pass_id, batch_id, fetched_data[0], fetched_data[1:]))
batch_id += 1
print_train_time(start_time, time.time(), num_samples)
if not args.no_test and batch_acc and not args.use_reader_op:
# we have not implemented recordio for test
# skip test when args.use_reader_op is set
test_acc = test(startup_exe, infer_prog, test_reader, feeder,
batch_acc)
print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
if args.use_reader_op:
train_args[4].reset() # reset reader handle
else:
del reader_generator
if not args.no_test and test_args[2]:
test_feeder = None
if not args.use_reader_op:
test_feed_var_list = [
var for var in test_prog.global_block().vars.itervalues()
if var.is_data
]
test_feeder = fluid.DataFeeder(test_feed_var_list, place)
test_ret = test_parallel(test_exe, test_args, args, test_prog,
test_feeder)
print("Pass: %d, Test Accuracy: %s\n" %
(pass_id, [np.mean(np.array(v)) for v in test_ret]))
print("total train time: ", time.time() - over_all_start)
def print_arguments(args):
......@@ -328,44 +322,46 @@ def main():
if args.use_cprof:
pr = cProfile.Profile()
pr.enable()
model_def = __import__("models.%s" % args.model, fromlist=["models"])
train_args = list(model_def.get_model(args))
train_args.append(args)
# Run optimizer.minimize(avg_loss)
train_args[2].minimize(train_args[0])
if args.memory_optimize:
fluid.memory_optimize(fluid.default_main_program())
train_prog = fluid.Program()
test_prog = fluid.Program()
startup_prog = fluid.Program()
train_args = list(model_def.get_model(args, True, train_prog, startup_prog))
test_args = list(model_def.get_model(args, False, test_prog, startup_prog))
all_args = [train_args, test_args, args]
if args.update_method == "pserver":
train_prog, startup_prog = dist_transpile(trainer_id, args)
train_prog, startup_prog = dist_transpile(trainer_id, args, train_prog,
startup_prog)
if not train_prog:
raise Exception(
"Must configure correct environments to run dist train.")
train_args.extend([train_prog, startup_prog])
all_args.extend([train_prog, test_prog, startup_prog])
if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
train_args.extend([nccl_id_var, num_trainers, trainer_id])
train_parallel(*train_args)
train(*train_args)
all_args.extend([nccl_id_var, num_trainers, trainer_id])
train_parallel(*all_args)
elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
# start pserver with Executor
server_exe = fluid.Executor(fluid.CPUPlace())
server_exe.run(startup_prog)
server_exe.run(train_prog)
exit(0)
# for other update methods, use default programs
train_args.append(fluid.default_main_program())
train_args.append(fluid.default_startup_program())
all_args.extend([train_prog, test_prog, startup_prog])
if args.update_method == "nccl2":
nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
if args.gpus == 1:
# NOTE: parallel executor uses profiler internally
if args.use_nvprof and args.device == 'GPU':
with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
train(*train_args)
else:
train(*train_args)
else:
if args.device == "CPU":
raise Exception("Only support GPU perf with parallel exe")
train_args.extend([nccl_id_var, num_trainers, trainer_id])
train_parallel(*train_args)
nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(
trainer_id, startup_prog)
if args.device == "CPU":
raise Exception("Only support GPU perf with parallel exe")
all_args.extend([nccl_id_var, num_trainers, trainer_id])
train_parallel(*all_args)
if __name__ == "__main__":
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
import random
import functools
import numpy as np
from threading import Thread
import subprocess
import time
from Queue import Queue
import paddle
from PIL import Image, ImageEnhance
random.seed(0)
DATA_DIM = 224
THREAD = int(os.getenv("PREPROCESS_THREADS", "10"))
BUF_SIZE = 5120
DATA_DIR = '/mnt/ImageNet'
TRAIN_LIST = '/mnt/ImageNet/train.txt'
TEST_LIST = '/mnt/ImageNet/val.txt'
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
def resize_short(img, target_size):
percent = float(target_size) / min(img.size[0], img.size[1])
resized_width = int(round(img.size[0] * percent))
resized_height = int(round(img.size[1] * percent))
img = img.resize((resized_width, resized_height), Image.LANCZOS)
return img
def crop_image(img, target_size, center):
width, height = img.size
size = target_size
if center == True:
w_start = (width - size) / 2
h_start = (height - size) / 2
else:
w_start = random.randint(0, width - size)
h_start = random.randint(0, height - size)
w_end = w_start + size
h_end = h_start + size
img = img.crop((w_start, h_start, w_end, h_end))
return img
def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
aspect_ratio = math.sqrt(random.uniform(*ratio))
w = 1. * aspect_ratio
h = 1. / aspect_ratio
bound = min((float(img.size[0]) / img.size[1]) / (w**2),
(float(img.size[1]) / img.size[0]) / (h**2))
scale_max = min(scale[1], bound)
scale_min = min(scale[0], bound)
target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
scale_max)
target_size = math.sqrt(target_area)
w = int(target_size * w)
h = int(target_size * h)
i = random.randint(0, img.size[0] - w)
j = random.randint(0, img.size[1] - h)
img = img.crop((i, j, i + w, j + h))
img = img.resize((size, size), Image.LANCZOS)
return img
def rotate_image(img):
angle = random.randint(-10, 10)
img = img.rotate(angle)
return img
def distort_color(img):
def random_brightness(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Brightness(img).enhance(e)
def random_contrast(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Contrast(img).enhance(e)
def random_color(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Color(img).enhance(e)
ops = [random_brightness, random_contrast, random_color]
random.shuffle(ops)
img = ops[0](img)
img = ops[1](img)
img = ops[2](img)
return img
def process_image(sample, mode, color_jitter, rotate):
img_path = sample[0]
img = Image.open(img_path)
if mode == 'train':
if rotate: img = rotate_image(img)
img = random_crop(img, DATA_DIM)
else:
img = resize_short(img, target_size=256)
img = crop_image(img, target_size=DATA_DIM, center=True)
if mode == 'train':
if color_jitter:
img = distort_color(img)
if random.randint(0, 1) == 1:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
if img.mode != 'RGB':
img = img.convert('RGB')
img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
img -= img_mean
img /= img_std
if mode == 'train' or mode == 'val':
return img, sample[1]
elif mode == 'test':
return [img]
class XmapEndSignal():
pass
def xmap_readers(mapper,
reader,
process_num,
buffer_size,
order=False,
print_queue_state=True):
end = XmapEndSignal()
# define a worker to read samples from reader to in_queue
def read_worker(reader, in_queue):
for i in reader():
in_queue.put(i)
in_queue.put(end)
# define a worker to read samples from reader to in_queue with order flag
def order_read_worker(reader, in_queue):
in_order = 0
for i in reader():
in_queue.put((in_order, i))
in_order += 1
in_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue
def handle_worker(in_queue, out_queue, mapper):
sample = in_queue.get()
while not isinstance(sample, XmapEndSignal):
r = mapper(sample)
out_queue.put(r)
sample = in_queue.get()
in_queue.put(end)
out_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue by order
def order_handle_worker(in_queue, out_queue, mapper, out_order):
ins = in_queue.get()
while not isinstance(ins, XmapEndSignal):
order, sample = ins
r = mapper(sample)
while order != out_order[0]:
pass
out_queue.put(r)
out_order[0] += 1
ins = in_queue.get()
in_queue.put(end)
out_queue.put(end)
def xreader():
file_queue = Queue()
in_queue = Queue(buffer_size)
out_queue = Queue(buffer_size)
out_order = [0]
# start a read worker in a thread
target = order_read_worker if order else read_worker
t = Thread(target=target, args=(reader, in_queue))
t.daemon = True
t.start()
# start several handle_workers
target = order_handle_worker if order else handle_worker
args = (in_queue, out_queue, mapper, out_order) if order else (
in_queue, out_queue, mapper)
workers = []
for i in xrange(process_num):
worker = Thread(target=target, args=args)
worker.daemon = True
workers.append(worker)
for w in workers:
w.start()
sample = out_queue.get()
start_t = time.time()
while not isinstance(sample, XmapEndSignal):
yield sample
sample = out_queue.get()
if time.time() - start_t > 3:
if print_queue_state:
print("queue sizes: ", in_queue.qsize(), out_queue.qsize())
start_t = time.time()
finish = 1
while finish < process_num:
sample = out_queue.get()
if isinstance(sample, XmapEndSignal):
finish += 1
else:
yield sample
return xreader
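A small usage sketch of `xmap_readers`; the mapper and sizes here are illustrative:

```python
def numbers():           # a toy sample source
    for i in range(8):
        yield i

def square(x):           # the mapper run by the handle_worker threads
    return x * x

# Four worker threads map samples concurrently; order=True preserves
# input order via the busy-wait loop in order_handle_worker.
reader = xmap_readers(square, numbers, 4, 16, order=True)
for item in reader():
    print(item)          # 0, 1, 4, 9, ...
```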
def _reader_creator(file_list,
mode,
shuffle=False,
color_jitter=False,
rotate=False,
xmap=True):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(full_lines)
if mode == 'train':
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
per_node_lines = len(full_lines) / trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1)
* per_node_lines]
print(
"read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
else:
lines = full_lines
for line in lines:
if mode == 'train':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "train", img_path)
yield (img_path, int(label))
elif mode == 'val':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "val", img_path)
yield (img_path, int(label))
elif mode == 'test':
img_path = os.path.join(DATA_DIR, line)
yield [img_path]
mapper = functools.partial(
process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
def load_raw_image_uint8(sample):
img_arr = np.array(Image.open(sample[0])).astype('int64')
return img_arr, int(sample[1])
def train_raw(file_list=TRAIN_LIST, shuffle=True):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(full_lines)
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
per_node_lines = len(full_lines) / trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) *
per_node_lines]
print("read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
for line in lines:
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "train", img_path)
yield (img_path, int(label))
return paddle.reader.xmap_readers(load_raw_image_uint8, reader, THREAD,
BUF_SIZE)
def train(file_list=TRAIN_LIST, xmap=True):
return _reader_creator(
file_list,
'train',
shuffle=True,
color_jitter=False,
rotate=False,
xmap=xmap)
def val(file_list=TEST_LIST, xmap=True):
return _reader_creator(file_list, 'val', shuffle=False, xmap=xmap)
def test(file_list=TEST_LIST):
return _reader_creator(file_list, 'test', shuffle=False)
if __name__ == "__main__":
c = 0
start_t = time.time()
for d in train()():
c += 1
if c >= 10000:
break
spent = time.time() - start_t
print("read 10000 speed: ", 10000 / spent, spent)
......@@ -163,6 +163,19 @@ def gen_job():
volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})
# add ceph volumes
volumes.append({
"name": "ceph-data",
"cephfs": {
"monitors": ["192.168.16.23:6789"],
"secretRef": {
"name": "ceph-secret"
},
"user": "admin",
}
})
volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
tn["spec"]["template"]["spec"]["volumes"] = volumes
tn_container["volumeMounts"] = volumeMounts
......
......@@ -13,5 +13,6 @@
# limitations under the License.
__all__ = [
"machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
"machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
"resnet_with_preprocess"
]
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""seq2seq model for fluid."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
......@@ -181,7 +182,7 @@ def lodtensor_to_ndarray(lod_tensor):
return ndarray
def get_model(args):
def get_model(args, is_train, main_prog, startup_prog):
if args.use_reader_op:
raise Exception("machine_translation do not support reader op for now.")
embedding_dim = 512
......@@ -190,30 +191,27 @@ def get_model(args):
dict_size = 30000
beam_size = 3
max_length = 250
avg_cost, feeding_list = seq_to_seq_net(
embedding_dim,
encoder_size,
decoder_size,
dict_size,
dict_size,
False,
beam_size=beam_size,
max_length=max_length)
# clone from default main program
inference_program = fluid.default_main_program().clone()
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
train_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=args.batch_size * args.gpus)
test_batch_generator = paddle.batch(
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
avg_cost, feeding_list = seq_to_seq_net(
embedding_dim,
encoder_size,
decoder_size,
dict_size,
dict_size,
False,
beam_size=beam_size,
max_length=max_length)
if is_train:
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer.minimize(avg_cost)
batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.test(dict_size), buf_size=1000),
batch_size=args.batch_size)
paddle.dataset.wmt14.train(dict_size)
if is_train else paddle.dataset.wmt14.test(dict_size),
buf_size=1000),
batch_size=args.batch_size * args.gpus)
return avg_cost, inference_program, optimizer, train_batch_generator, \
test_batch_generator, None
return avg_cost, optimizer, [], batch_generator, None
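This is the pattern the whole change converts every model to: `get_model` now builds into caller-supplied programs, and `fluid.unique_name.guard()` restarts name generation so the train and test programs create identically named parameters and can share one scope. A minimal standalone sketch of the pattern, with a toy network and assumed shapes:

```python
import paddle.fluid as fluid

def build(is_train, main_prog, startup_prog):
    with fluid.program_guard(main_prog, startup_prog):
        with fluid.unique_name.guard():
            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
            pred = fluid.layers.fc(input=x, size=1)
            loss = fluid.layers.mean(
                fluid.layers.square_error_cost(input=pred, label=y))
            if is_train:  # only the train program carries optimizer ops
                fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
    return loss

train_prog, test_prog, startup_prog = (fluid.Program(), fluid.Program(),
                                       fluid.Program())
train_loss = build(True, train_prog, startup_prog)
test_loss = build(False, test_prog, startup_prog)  # shares parameter names
```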
......@@ -65,61 +65,53 @@ def cnn_model(data):
return predict
def get_model(args):
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
data_file = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.device == 'CPU' and args.cpus > 1:
places = fluid.layers.get_places(args.cpus)
pd = fluid.layers.ParallelDo(places)
with pd.do():
predict = cnn_model(pd.read_input(images))
label = pd.read_input(label)
def get_model(args, is_train, main_prog, startup_prog):
# NOTE: mnist is small, we don't implement data sharding yet.
opt = None
data_file_handle = None
with fluid.program_guard(main_prog, startup_prog):
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f)
for f in os.listdir(args.data_path)
]
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=1,
pass_num=1)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file_handle, batch_size=args.batch_size))
with fluid.unique_name.guard():
if args.use_reader_op:
input, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_acc = fluid.layers.accuracy(input=predict, label=label)
pd.write_output(avg_cost)
pd.write_output(batch_acc)
avg_cost, batch_acc = pd()
avg_cost = fluid.layers.mean(avg_cost)
batch_acc = fluid.layers.mean(batch_acc)
else:
# Train program
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_acc = fluid.layers.accuracy(input=predict, label=label)
# inference program
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
# Optimization
if is_train:
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
opt.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=args.batch_size)
return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
if is_train:
reader = paddle.dataset.mnist.train()
else:
reader = paddle.dataset.mnist.test()
batched_reader = paddle.batch(
reader, batch_size=args.batch_size * args.gpus)
return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
......@@ -20,6 +20,7 @@ import functools
import numpy as np
import time
import os
import math
import cProfile, pstats, StringIO
......@@ -27,182 +28,210 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
from recordio_converter import imagenet_train, imagenet_test
def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
conv1 = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv1, act=act)
def shortcut(input, ch_out, stride):
ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1]
if ch_in != ch_out:
return conv_bn_layer(input, ch_out, 1, stride, 0, None)
else:
return input
def basicblock(input, ch_out, stride):
short = shortcut(input, ch_out, stride)
conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def bottleneck(input, ch_out, stride):
short = shortcut(input, ch_out * 4, stride)
conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
from imagenet_reader import train, val
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
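With these values, the piecewise decay built later in `get_model` works out as follows (a single trainer and batch size 256 are assumed for illustration):

```python
total_images = 1281167                     # ImageNet training set size
batch_size = 256
step = int(total_images / batch_size + 1)  # 5005 iterations per epoch
epochs = [30, 60, 90]
bd = [step * e for e in epochs]            # [150150, 300300, 450450]
base_lr = 0.1
lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]
# lr == [0.1, 0.01, 0.001, 0.0001]: a 10x drop at each boundary.
```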
class ResNet():
def __init__(self, layers=50, is_train=True):
self.params = train_parameters
self.layers = layers
self.is_train = is_train
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
conv = self.conv_bn_layer(
input=input, num_filters=64, filter_size=7, stride=2, act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(input=pool,
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(
input=conv, act=act, is_test=not self.is_train)
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
return self.conv_bn_layer(input, ch_out, 1, stride)
else:
return input
def bottleneck_block(self, input, num_filters, stride):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu')
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)
def layer_warp(block_func, input, ch_out, count, stride):
res_out = block_func(input, ch_out, stride)
for i in range(1, count):
res_out = block_func(res_out, ch_out, 1)
return res_out
short = self.shortcut(input, num_filters * 4, stride)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
cfg = {
18: ([2, 2, 2, 1], basicblock),
34: ([3, 4, 6, 3], basicblock),
50: ([3, 4, 6, 3], bottleneck),
101: ([3, 4, 23, 3], bottleneck),
152: ([3, 8, 36, 3], bottleneck)
}
stages, block_func = cfg[depth]
conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
pool1 = fluid.layers.pool2d(
input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
res2 = layer_warp(block_func, res1, 128, stages[1], 2)
res3 = layer_warp(block_func, res2, 256, stages[2], 2)
res4 = layer_warp(block_func, res3, 512, stages[3], 2)
pool2 = fluid.layers.pool2d(
input=res4,
pool_size=7,
pool_type='avg',
pool_stride=1,
global_pooling=True)
out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
return out
def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
assert (depth - 2) % 6 == 0
n = (depth - 2) // 6
conv1 = conv_bn_layer(
input=input, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, n, 1)
res2 = layer_warp(basicblock, res1, 32, n, 2)
res3 = layer_warp(basicblock, res2, 64, n, 2)
pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
return out
def get_model(args):
model = resnet_cifar10
if args.data_set == "cifar10":
class_dim = 10
if args.data_format == 'NCHW':
dshape = [3, 32, 32]
else:
dshape = [32, 32, 3]
model = resnet_cifar10
train_reader = paddle.dataset.cifar.train10()
test_reader = paddle.dataset.cifar.test10()
elif args.data_set == "flowers":
def _model_reader_dshape_classdim(args, is_train):
model = None
reader = None
if args.data_set == "flowers":
class_dim = 102
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
train_reader = paddle.dataset.flowers.train()
test_reader = paddle.dataset.flowers.test()
if is_train:
reader = paddle.dataset.flowers.train()
else:
reader = paddle.dataset.flowers.test()
elif args.data_set == "imagenet":
class_dim = 1000
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if not args.data_path:
raise Exception(
"Must specify --data_path when training with imagenet")
train_reader = imagenet_train(args.data_path)
test_reader = imagenet_test(args.data_path)
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
data_file = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + dshape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
input, label = fluid.layers.read_file(data_file)
else:
input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.device == 'CPU' and args.cpus > 1:
places = fluid.layers.get_places(args.cpus)
pd = fluid.layers.ParallelDo(places)
with pd.do():
predict = model(pd.read_input(input), class_dim)
label = pd.read_input(label)
if not args.use_reader_op:
if is_train:
reader = train()
else:
reader = val()
else:
if is_train:
reader = train(xmap=False)
else:
reader = val(xmap=False)
return reader, dshape, class_dim
def get_model(args, is_train, main_prog, startup_prog):
reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train)
pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=args.batch_size * args.gpus,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('float32', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
model = ResNet(is_train=is_train)
predict = model.net(input, class_dim=class_dim)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
batch_acc = fluid.layers.accuracy(input=predict, label=label)
pd.write_output(avg_cost)
pd.write_output(batch_acc)
avg_cost, batch_acc = pd()
avg_cost = fluid.layers.mean(avg_cost)
batch_acc = fluid.layers.mean(batch_acc)
batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
# configure optimizer
optimizer = None
if is_train:
total_images = 1281167 / trainer_count
step = int(total_images / (args.batch_size * args.gpus) + 1)
epochs = [30, 60, 90]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if not args.use_reader_op:
batched_reader = paddle.batch(
reader if args.no_random else paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size * args.gpus,
drop_last=True)
else:
predict = model(input, class_dim)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
batch_acc = fluid.layers.accuracy(input=predict, label=label)
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc])
optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
batched_train_reader = paddle.batch(
train_reader if args.no_random else paddle.reader.shuffle(
train_reader, buf_size=5120),
batch_size=args.batch_size * args.gpus,
drop_last=True)
batched_test_reader = paddle.batch(
test_reader, batch_size=args.batch_size, drop_last=True)
return avg_cost, inference_program, optimizer, batched_train_reader,\
batched_test_reader, batch_acc
batched_reader = None
pyreader.decorate_paddle_reader(
paddle.batch(
reader if args.no_random else paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size))
return avg_cost, optimizer, [batch_acc1,
batch_acc5], batched_reader, pyreader
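For reference, the py_reader lifecycle that `train_parallel` drives for the handle returned here, written as a self-contained helper (a sketch; the exception handling mirrors the calls already in this diff):

```python
import paddle.fluid as fluid

def run_one_pass(exe, pyreader, avg_loss):
    """Drive one pass of a py_reader-fed program.

    exe:      executor built on the same program
    pyreader: handle returned by get_model (decorate_paddle_reader
              was already called inside get_model)
    """
    pyreader.start()                      # spawn the feeding thread
    while True:
        try:
            exe.run(fetch_list=[avg_loss.name])
        except fluid.core.EOFException:   # decorated reader exhausted
            pyreader.reset()              # rewind so the next pass can start()
            break
```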
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import time
import os
import cProfile, pstats, StringIO
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
# from recordio_converter import imagenet_train, imagenet_test
from imagenet_reader import train_raw, val
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
is_train=True):
conv1 = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)
def shortcut(input, ch_out, stride, is_train=True):
ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1]
if ch_in != ch_out:
return conv_bn_layer(
input, ch_out, 1, stride, 0, None, is_train=is_train)
else:
return input
def basicblock(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def bottleneck(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out * 4, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
conv3 = conv_bn_layer(
conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
def layer_warp(block_func, input, ch_out, count, stride):
res_out = block_func(input, ch_out, stride)
for i in range(1, count):
res_out = block_func(res_out, ch_out, 1)
return res_out
def resnet_imagenet(input,
class_dim,
depth=50,
data_format='NCHW',
is_train=True):
cfg = {
18: ([2, 2, 2, 1], basicblock),
34: ([3, 4, 6, 3], basicblock),
50: ([3, 4, 6, 3], bottleneck),
101: ([3, 4, 23, 3], bottleneck),
152: ([3, 8, 36, 3], bottleneck)
}
stages, block_func = cfg[depth]
conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
pool1 = fluid.layers.pool2d(
input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
res2 = layer_warp(block_func, res1, 128, stages[1], 2)
res3 = layer_warp(block_func, res2, 256, stages[2], 2)
res4 = layer_warp(block_func, res3, 512, stages[3], 2)
pool2 = fluid.layers.pool2d(
input=res4,
pool_size=7,
pool_type='avg',
pool_stride=1,
global_pooling=True)
out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
return out
def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
assert (depth - 2) % 6 == 0
n = (depth - 2) // 6
conv1 = conv_bn_layer(
input=input, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, n, 1)
res2 = layer_warp(basicblock, res1, 32, n, 2)
res3 = layer_warp(basicblock, res2, 64, n, 2)
pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
return out
def _model_reader_dshape_classdim(args, is_train):
model = resnet_cifar10
reader = None
if args.data_set == "cifar10":
class_dim = 10
if args.data_format == 'NCHW':
dshape = [3, 32, 32]
else:
dshape = [32, 32, 3]
model = resnet_cifar10
if is_train:
reader = paddle.dataset.cifar.train10()
else:
reader = paddle.dataset.cifar.test10()
elif args.data_set == "flowers":
class_dim = 102
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if is_train:
reader = paddle.dataset.flowers.train()
else:
reader = paddle.dataset.flowers.test()
elif args.data_set == "imagenet":
class_dim = 1000
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if not args.data_path:
raise Exception(
"Must specify --data_path when training with imagenet")
if not args.use_reader_op:
if is_train:
reader = train_raw()
else:
reader = val()
else:
if is_train:
reader = train_raw()
else:
reader = val(xmap=False)
return model, reader, dshape, class_dim
def get_model(args, is_train, main_prog, startup_prog):
model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
is_train)
pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=args.batch_size * args.gpus,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('uint8', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='uint8')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
# add imagenet preprocessors
random_crop = fluid.layers.random_crop(input, dshape)
casted = fluid.layers.cast(random_crop, 'float32')
# input is HWC
trans = fluid.layers.transpose(casted, [0, 3, 1, 2]) / 255.0
img_mean = fluid.layers.tensor.assign(
np.array([0.485, 0.456, 0.406]).astype('float32').reshape((3, 1,
1)))
img_std = fluid.layers.tensor.assign(
np.array([0.229, 0.224, 0.225]).astype('float32').reshape((3, 1,
1)))
h1 = fluid.layers.elementwise_sub(trans, img_mean, axis=1)
h2 = fluid.layers.elementwise_div(h1, img_std, axis=1)
# pre_out = (trans - img_mean) / img_std
predict = model(h2, class_dim, is_train=is_train)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
# configure optimizer
optimizer = None
if is_train:
total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1)
epochs = [30, 60, 80, 90]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=base_lr,
#learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if not args.use_reader_op:
batched_reader = paddle.batch(
reader if args.no_random else paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size * args.gpus,
drop_last=True)
else:
batched_reader = None
pyreader.decorate_paddle_reader(
paddle.batch(
# reader if args.no_random else paddle.reader.shuffle(
# reader, buf_size=5120),
reader,
batch_size=args.batch_size))
return avg_cost, optimizer, [batch_acc1,
batch_acc5], batched_reader, pyreader
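The in-graph preprocessing above is equivalent to this NumPy computation on one uint8 NHWC batch (a clarifying sketch, not used by the benchmark):

```python
import numpy as np

def preprocess(batch_nhwc_uint8):
    # batch: (N, H, W, 3) uint8, already random-cropped to dshape.
    img = batch_nhwc_uint8.astype('float32')
    img = img.transpose((0, 3, 1, 2)) / 255.0  # NHWC -> NCHW, scale to [0, 1]
    mean = np.array([0.485, 0.456, 0.406], 'float32').reshape((3, 1, 1))
    std = np.array([0.229, 0.224, 0.225], 'float32').reshape((3, 1, 1))
    return (img - mean) / std                  # per-channel normalization
```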
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import math
import os
from imagenet_reader import train, val
__all__ = [
"SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d",
"SE_ResNeXt152_32x4d", "get_model"
]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class SE_ResNeXt():
def __init__(self, layers=50, is_train=True):
self.params = train_parameters
self.layers = layers
self.is_train = is_train
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 101:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 23, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 152:
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=3,
stride=2,
act='relu')
conv = self.conv_bn_layer(
input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
conv = self.conv_bn_layer(
input=conv,
num_filters=128,
filter_size=3,
stride=1,
act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.5)
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(input=drop,
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
return out
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
filter_size = 1
return self.conv_bn_layer(input, ch_out, filter_size, stride)
else:
return input
def bottleneck_block(self, input, num_filters, stride, cardinality,
reduction_ratio):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu')
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
scale = self.squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
short = self.shortcut(input, num_filters * 2, stride)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) / 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(
input=conv, act=act, is_test=not self.is_train)
def squeeze_excitation(self, input, num_channels, reduction_ratio):
pool = fluid.layers.pool2d(
input=input, pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(input=pool,
size=num_channels // reduction_ratio,  # floor division for Python 3 compatibility
act='relu',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(
-stdv, stdv)))
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(input=squeeze,
size=num_channels,
act='sigmoid',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(
-stdv, stdv)))
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def SE_ResNeXt50_32x4d():
model = SE_ResNeXt(layers=50)
return model
def SE_ResNeXt101_32x4d():
model = SE_ResNeXt(layers=101)
return model
def SE_ResNeXt152_32x4d():
model = SE_ResNeXt(layers=152)
return model
def get_model(args, is_train, main_prog, startup_prog):
model = SE_ResNeXt(layers=50)
batched_reader = None
pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS", 1))  # default to one trainer when unset
dshape = train_parameters["input_size"]
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=10,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('float32', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
out = model.net(input=input)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
optimizer = None
if is_train:
total_images = 1281167 // trainer_count  # ImageNet-1k images per trainer
step = int(total_images / args.batch_size + 1)
epochs = [40, 80, 100]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
# learning_rate=base_lr,
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if is_train:
reader = train()
else:
reader = val()
if not args.use_reader_op:
batched_reader = paddle.batch(
reader, batch_size=args.batch_size * args.gpus, drop_last=True)
else:
pyreader.decorate_paddle_reader(
paddle.batch(
reader, batch_size=args.batch_size))
return avg_cost, optimizer, [acc_top1, acc_top5], batched_reader, pyreader
......@@ -26,7 +26,6 @@ import numpy
import paddle
import paddle.dataset.imdb as imdb
import paddle.fluid as fluid
import paddle.batch as batch
import paddle.fluid.profiler as profiler
word_dict = imdb.word_dict()
......@@ -43,19 +42,7 @@ def crop_sentence(reader, crop_size):
return __impl__
def get_model(args):
if args.use_reader_op:
raise Exception(
"stacked_dynamic_lstm does not support the reader op for now.")
lstm_size = 512
emb_dim = 512
crop_size = 1500
data = fluid.layers.data(
name="words", shape=[1], lod_level=1, dtype='int64')
sentence = fluid.layers.embedding(
input=data, size=[len(word_dict), emb_dim])
def lstm_net(sentence, lstm_size):
sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
rnn = fluid.layers.DynamicRNN()
......@@ -97,31 +84,47 @@ def get_model(args):
last = fluid.layers.sequence_pool(rnn(), 'last')
logit = fluid.layers.fc(input=last, size=2, act='softmax')
loss = fluid.layers.cross_entropy(
input=logit,
label=fluid.layers.data(
name='label', shape=[1], dtype='int64'))
loss = fluid.layers.mean(x=loss)
return logit
# add acc
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
shape=[1], dtype='int64'), total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc, batch_size_tensor])
adam = fluid.optimizer.Adam()
def get_model(args, is_train, main_prog, startup_prog):
if args.use_reader_op:
raise Exception(
"stacked_dynamic_lstm does not support the reader op for now.")
lstm_size = 512
emb_dim = 512
crop_size = 1500
train_reader = batch(
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
data = fluid.layers.data(
name="words", shape=[1], lod_level=1, dtype='int64')
sentence = fluid.layers.embedding(
input=data, size=[len(word_dict), emb_dim])
logit = lstm_net(sentence, lstm_size)
loss = fluid.layers.cross_entropy(
input=logit,
label=fluid.layers.data(
name='label', shape=[1], dtype='int64'))
loss = fluid.layers.mean(x=loss)
# add acc
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
shape=[1], dtype='int64'), total=batch_size_tensor)
if is_train:
adam = fluid.optimizer.Adam()
adam.minimize(loss)
if is_train:
reader = crop_sentence(imdb.train(word_dict), crop_size)
else:
reader = crop_sentence(imdb.test(word_dict), crop_size)
batched_reader = paddle.batch(
paddle.reader.shuffle(
crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
reader, buf_size=25000),
batch_size=args.batch_size * args.gpus)
test_reader = batch(
paddle.reader.shuffle(
crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
batch_size=args.batch_size)
return loss, inference_program, adam, train_reader, test_reader, batch_acc
return loss, adam, [batch_acc], batched_reader, None
......@@ -25,7 +25,7 @@ import functools
import os
def vgg16_bn_drop(input):
def vgg16_bn_drop(input, is_train=True):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
......@@ -46,13 +46,13 @@ def vgg16_bn_drop(input):
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train)
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
return fc2
def get_model(args):
def get_model(args, is_train, main_prog, startup_prog):
if args.data_set == "cifar10":
classdim = 10
if args.data_format == 'NCHW':
......@@ -65,57 +65,56 @@ def get_model(args):
data_shape = [3, 224, 224]
else:
data_shape = [224, 224, 3]
with fluid.program_guard(main_prog, startup_prog):
data_file_handle = None  # stays None unless the reader op is used
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + data_shape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=1,
pass_num=1)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file_handle, batch_size=args.batch_size))
with fluid.unique_name.guard():
if args.use_reader_op:
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='data', shape=data_shape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images, is_train=is_train)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
data_file = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + data_shape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='data', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc, batch_size_tensor])
# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
# Optimization
if is_train:
optimizer = fluid.optimizer.Adam(
learning_rate=args.learning_rate)
optimizer.minimize(avg_cost)
# data reader
train_reader = paddle.batch(
if is_train:
reader = paddle.dataset.cifar.train10() \
if args.data_set == 'cifar10' else paddle.dataset.flowers.train()
else:
reader = paddle.dataset.cifar.test10() \
if args.data_set == 'cifar10' else paddle.dataset.flowers.test()
batched_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
reader, buf_size=5120),
batch_size=args.batch_size * args.gpus)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=args.batch_size)
return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle
......@@ -16,16 +16,6 @@ set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
include_directories(${ANAKIN_INCLUDE})
include_directories(${ANAKIN_INCLUDE}/saber/)
include_directories(${ANAKIN_INCLUDE}/saber/core/)
......@@ -48,21 +38,24 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder
-Wno-error=cpp)
if(WITH_GPU)
set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=YES -DCUDNN_ROOT=${CUDNN_ROOT} -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR})
else()
set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=NO)
endif()
ExternalProject_Add(
extern_anakin
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLML_PROJECT}
GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin"
GIT_TAG "9424277cf9ae180a14aff09560d3cd60a49c76d2"
GIT_TAG "3c8554f4978628183566ab7dd6c1e7e66493c7cd"
PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
CMAKE_ARGS ${CMAKE_ARGS_PREFIX}
-DUSE_X86_PLACE=YES
-DBUILD_WITH_UNIT_TEST=NO
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
-DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-DCUDNN_ROOT=${CUDNN_ROOT}
-DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
-DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
......
......@@ -29,7 +29,7 @@ INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
MESSAGE(STATUS "use pre defined download url")
SET(MKLML_VER "mklml_lnx_2018.0.3.20180406" CACHE STRING "" FORCE)
SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
......
......@@ -211,7 +211,7 @@ function(merge_static_libs TARGET_NAME)
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
#endif()
endforeach()
# Windows cmd returns an error in a clean env.
# COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
......@@ -255,7 +255,7 @@ function(cc_library TARGET_NAME)
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${cc_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
......@@ -298,11 +298,10 @@ function(cc_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (${cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
endif()
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(cc_test)
......@@ -366,11 +365,10 @@ function(nv_test TARGET_NAME)
add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
endif()
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(nv_test)
......@@ -558,26 +556,26 @@ function(paddle_protobuf_generate_cpp SRCS HDRS)
set(${HDRS})
if (MOBILE_INFERENCE)
set(EXTRA_FLAG "lite:")
else()
set(EXTRA_FLAG "")
endif()
foreach(FIL ${ARGN})
get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
get_filename_component(FIL_WE ${FIL} NAME_WE)
set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
list(APPEND ${SRCS} "${_protobuf_protoc_src}")
list(APPEND ${HDRS} "${_protobuf_protoc_hdr}")
add_custom_command(
OUTPUT "${_protobuf_protoc_src}"
"${_protobuf_protoc_hdr}"
COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-I${CMAKE_CURRENT_SOURCE_DIR}
--cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
DEPENDS ${ABS_FIL} protoc
......@@ -646,7 +644,7 @@ function(grpc_library TARGET_NAME)
get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
#FIXME(putcn): the following line is supposed to generate *.pb.h and .cc, but
# somehow it didn't. Lines 602 to 604 patch this. Leaving this here
# for now to enable dist CI.
protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
......
......@@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework")
if (NOT WIN32)
copy(framework_lib DEPS framework_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
)
else()
copy(framework_lib
set(framework_lib_deps framework_py_proto)
endif(NOT WIN32)
copy(framework_lib DEPS ${framework_lib_deps}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
${src_dir}/${module}/ir/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
)
endif(NOT WIN32)
set(module "memory")
copy(memory_lib
......@@ -148,12 +145,12 @@ copy(memory_lib
set(inference_deps paddle_fluid_shared paddle_fluid)
set(module "inference/api")
if (WITH_ANAKIN AND WITH_GPU)
if (WITH_ANAKIN AND WITH_MKL)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${ANAKIN_INSTALL_DIR} # anakin release
DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
list(APPEND inference_deps anakin_inference_lib)
endif()
......@@ -161,7 +158,8 @@ set(module "inference")
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "platform")
......
......@@ -16,7 +16,9 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
DOC "Path to TensorRT library.")
if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
if(WITH_DSO)
set(TENSORRT_FOUND ON)
endif(WITH_DSO)
else()
set(TENSORRT_FOUND OFF)
endif()
......
# For Readers and Developers
Thanks for reading PaddlePaddle documentation.
Since **September 17th, 2018**, the **0.15.0 and develop** documentation source has been moved to [FluidDoc Repo](https://github.com/PaddlePaddle/FluidDoc) and updated there.
Please turn to FluidDoc Repo for the latest documentation.
if(NOT DEFINED SPHINX_THEME)
set(SPHINX_THEME default)
endif()
if(NOT DEFINED SPHINX_THEME_DIR)
set(SPHINX_THEME_DIR)
endif()
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output directory
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
set(IMPORT_PADDLE_STRING "")
set(IMPORT_PADDLEV2_STRING "")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY)
sphinx_add_target(paddle_fluid_docs
html
${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
# HTML output directory
set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
"${BINARY_BUILD_DIR_CN}/conf.py"
@ONLY)
sphinx_add_target(paddle_fluid_docs_cn
html
${BINARY_BUILD_DIR_CN}
${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
add_subdirectory(api)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output directory
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
set(IMPORT_PADDLE_STRING "import paddle")
set(IMPORT_PADDLEV2_STRING "import paddle.v2")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY)
sphinx_add_target(paddle_fluid_apis
html
${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=============
fluid.average
=============
.. _api_fluid_average_WeightedAverage:
WeightedAverage
---------------
.. autoclass:: paddle.fluid.average.WeightedAverage
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==============
fluid.backward
==============
.. _api_fluid_backward_append_backward:
append_backward
---------------
.. autofunction:: paddle.fluid.backward.append_backward
:noindex:
.. _api_fluid_backward_calc_gradient:
calc_gradient
-------------
.. autofunction:: paddle.fluid.backward.calc_gradient
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==========
fluid.clip
==========
.. _api_fluid_clip_ErrorClipByValue:
ErrorClipByValue
----------------
.. autoclass:: paddle.fluid.clip.ErrorClipByValue
:members:
:noindex:
.. _api_fluid_clip_GradientClipByValue:
GradientClipByValue
-------------------
.. autoclass:: paddle.fluid.clip.GradientClipByValue
:members:
:noindex:
.. _api_fluid_clip_GradientClipByNorm:
GradientClipByNorm
------------------
.. autoclass:: paddle.fluid.clip.GradientClipByNorm
:members:
:noindex:
.. _api_fluid_clip_GradientClipByGlobalNorm:
GradientClipByGlobalNorm
------------------------
.. autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm
:members:
:noindex:
=====================
Data Reader Interface
=====================
DataTypes
=========
.. autofunction:: paddle.v2.data_type.dense_array
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_non_value_slot
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_value_slot
:noindex:
.. autoclass:: paddle.v2.data_type.InputType
:members:
:noindex:
DataFeeder
==========
.. automodule:: paddle.v2.data_feeder
:members:
:noindex:
Reader
======
.. automodule:: paddle.reader
:members:
:noindex:
.. automodule:: paddle.reader.creator
:members:
:noindex:
minibatch
=========
.. automodule:: paddle.v2.minibatch
:members:
:noindex:
Dataset
=======
.. automodule:: paddle.dataset
:members:
:noindex:
mnist
+++++
.. automodule:: paddle.dataset.mnist
:members:
:noindex:
cifar
+++++
.. automodule:: paddle.dataset.cifar
:members:
:noindex:
conll05
+++++++
.. automodule:: paddle.dataset.conll05
:members: get_dict,get_embedding,test
:noindex:
imdb
++++
.. automodule:: paddle.dataset.imdb
:members:
:noindex:
imikolov
++++++++
.. automodule:: paddle.dataset.imikolov
:members:
:noindex:
movielens
+++++++++
.. automodule:: paddle.dataset.movielens
:members:
:noindex:
.. autoclass:: paddle.dataset.movielens.MovieInfo
:noindex:
.. autoclass:: paddle.dataset.movielens.UserInfo
:noindex:
sentiment
+++++++++
.. automodule:: paddle.dataset.sentiment
:members:
:noindex:
uci_housing
+++++++++++
.. automodule:: paddle.dataset.uci_housing
:members:
:noindex:
wmt14
+++++
.. automodule:: paddle.dataset.wmt14
:members:
:noindex:
wmt16
+++++
.. automodule:: paddle.dataset.wmt16
:members:
:noindex:
Image Interface
===============
.. automodule:: paddle.v2.image
:members:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=================
fluid.data_feeder
=================
.. _api_fluid_data_feeder_DataFeeder:
DataFeeder
----------
.. autoclass:: paddle.fluid.data_feeder.DataFeeder
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==============
fluid.executor
==============
.. _api_fluid_executor_Executor:
Executor
--------
.. autoclass:: paddle.fluid.executor.Executor
:members:
:noindex:
.. _api_fluid_executor_global_scope:
global_scope
------------
.. autofunction:: paddle.fluid.executor.global_scope
:noindex:
.. _api_fluid_executor_scope_guard:
scope_guard
-----------
.. autofunction:: paddle.fluid.executor.scope_guard
:noindex:
.. _api_fluid_executor__switch_scope:
_switch_scope
-------------
.. autofunction:: paddle.fluid.executor._switch_scope
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=====
fluid
=====
.. _api_fluid_Block:
Block
-----
.. autoclass:: paddle.fluid.Block
:members:
:noindex:
.. _api_fluid_Variable:
Variable
--------
.. autoclass:: paddle.fluid.Variable
:members:
:noindex:
.. _api_fluid_Program:
Program
-------
.. autoclass:: paddle.fluid.Program
:members:
:noindex:
.. _api_fluid_Operator:
Operator
--------
.. autoclass:: paddle.fluid.Operator
:members:
:noindex:
.. _api_fluid_default_startup_program:
default_startup_program
-----------------------
.. autofunction:: paddle.fluid.default_startup_program
:noindex:
.. _api_fluid_default_main_program:
default_main_program
--------------------
.. autofunction:: paddle.fluid.default_main_program
:noindex:
.. _api_fluid_program_guard:
program_guard
-------------
.. autofunction:: paddle.fluid.program_guard
:noindex:
.. _api_fluid_get_var:
get_var
-------
.. autofunction:: paddle.fluid.get_var
:noindex:
.. _api_fluid_Executor:
Executor
--------
.. autoclass:: paddle.fluid.Executor
:members:
:noindex:
.. _api_fluid_global_scope:
global_scope
------------
.. autofunction:: paddle.fluid.global_scope
:noindex:
.. _api_fluid_scope_guard:
scope_guard
-----------
.. autofunction:: paddle.fluid.scope_guard
:noindex:
.. _api_fluid__switch_scope:
_switch_scope
-------------
.. autofunction:: paddle.fluid._switch_scope
:noindex:
.. _api_fluid_make_channel:
make_channel
------------
.. autofunction:: paddle.fluid.make_channel
:noindex:
.. _api_fluid_channel_send:
channel_send
------------
.. autofunction:: paddle.fluid.channel_send
:noindex:
.. _api_fluid_channel_recv:
channel_recv
------------
.. autofunction:: paddle.fluid.channel_recv
:noindex:
.. _api_fluid_channel_close:
channel_close
-------------
.. autofunction:: paddle.fluid.channel_close
:noindex:
.. _api_fluid_Select:
Select
------
.. autoclass:: paddle.fluid.Select
:members:
:noindex:
.. _api_fluid_Trainer:
Trainer
-------
.. autoclass:: paddle.fluid.Trainer
:members:
:noindex:
.. _api_fluid_BeginEpochEvent:
BeginEpochEvent
---------------
.. autoclass:: paddle.fluid.BeginEpochEvent
:members:
:noindex:
.. _api_fluid_EndEpochEvent:
EndEpochEvent
-------------
.. autoclass:: paddle.fluid.EndEpochEvent
:members:
:noindex:
.. _api_fluid_BeginStepEvent:
BeginStepEvent
--------------
.. autoclass:: paddle.fluid.BeginStepEvent
:members:
:noindex:
.. _api_fluid_EndStepEvent:
EndStepEvent
------------
.. autoclass:: paddle.fluid.EndStepEvent
:members:
:noindex:
.. _api_fluid_CheckpointConfig:
CheckpointConfig
----------------
.. autoclass:: paddle.fluid.CheckpointConfig
:members:
:noindex:
.. _api_fluid_Inferencer:
Inferencer
----------
.. autoclass:: paddle.fluid.Inferencer
:members:
:noindex:
.. _api_fluid_DistributeTranspiler:
DistributeTranspiler
--------------------
.. autoclass:: paddle.fluid.DistributeTranspiler
:members:
:noindex:
.. _api_fluid_memory_optimize:
memory_optimize
---------------
.. autofunction:: paddle.fluid.memory_optimize
:noindex:
.. _api_fluid_release_memory:
release_memory
--------------
.. autofunction:: paddle.fluid.release_memory
:noindex:
.. _api_fluid_ParallelExecutor:
ParallelExecutor
----------------
.. autoclass:: paddle.fluid.ParallelExecutor
:members:
:noindex:
.. _api_fluid_ExecutionStrategy:
ExecutionStrategy
-----------------
.. autoclass:: paddle.fluid.ExecutionStrategy
:members:
:noindex:
.. _api_fluid_BuildStrategy:
BuildStrategy
-------------
.. autoclass:: paddle.fluid.BuildStrategy
:members:
:noindex:
.. _api_fluid_create_lod_tensor:
create_lod_tensor
-----------------
.. autofunction:: paddle.fluid.create_lod_tensor
:noindex:
.. _api_fluid_create_random_int_lodtensor:
create_random_int_lodtensor
---------------------------
.. autofunction:: paddle.fluid.create_random_int_lodtensor
:noindex:
.. _api_fluid_LoDTensor:
LoDTensor
---------
.. autoclass:: paddle.fluid.LoDTensor
:members:
:noindex:
.. _api_fluid_CPUPlace:
CPUPlace
--------
.. autoclass:: paddle.fluid.CPUPlace
:members:
:noindex:
.. _api_fluid_CUDAPlace:
CUDAPlace
---------
.. autoclass:: paddle.fluid.CUDAPlace
:members:
:noindex:
.. _api_fluid_CUDAPinnedPlace:
CUDAPinnedPlace
---------------
.. autoclass:: paddle.fluid.CUDAPinnedPlace
:members:
:noindex:
.. _api_fluid_Tensor:
Tensor
------
.. autoclass:: paddle.fluid.Tensor
:members:
:noindex:
.. _api_fluid_ParamAttr:
ParamAttr
---------
.. autoclass:: paddle.fluid.ParamAttr
:members:
:noindex:
.. _api_fluid_WeightNormParamAttr:
WeightNormParamAttr
-------------------
.. autoclass:: paddle.fluid.WeightNormParamAttr
:members:
:noindex:
.. _api_fluid_DataFeeder:
DataFeeder
----------
.. autoclass:: paddle.fluid.DataFeeder
:members:
:noindex:
.. _api_fluid_Scope:
Scope
-----
.. autoclass:: paddle.fluid.Scope
:members:
:noindex:
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import sys
import types
import paddle.fluid as fluid
def parse_arg():
parser = argparse.ArgumentParser()
parser.add_argument('--submodules', nargs="*")
parser.add_argument(
'module', type=str, help='Generate the documentation of which module')
return parser.parse_args()
class DocGenerator(object):
def __init__(self, module_name=None, stream=sys.stdout):
if module_name == "":
module_name = None
self.stream = stream
if module_name is None:
self.module_name = "fluid"
else:
self.module_name = "fluid." + module_name
if module_name is None:
self.module = fluid
else:
if not hasattr(fluid, module_name):
raise ValueError("Cannot find fluid.{0}".format(module_name))
else:
self.module = getattr(fluid, module_name)
self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
''')
self._print_header_(self.module_name, dot='=', is_title=True)
def print_submodule(self, submodule_name):
submodule = getattr(self.module, submodule_name)
if submodule is None:
raise ValueError("Cannot find submodule {0}".format(submodule_name))
self.print_section(submodule_name)
for item in submodule.__all__:
self.print_item(item)
def print_current_module(self):
for item in self.module.__all__:
self.print_item(item)
def print_section(self, name):
self._print_header_(name, dot='=', is_title=False)
def print_item(self, name):
item = getattr(self.module, name, None)
if item is None:
return
if isinstance(item, type):  # types.TypeType exists only under Python 2
self.print_class(name)
elif isinstance(item, types.FunctionType):
self.print_method(name)
else:
pass
def print_class(self, name):
self._print_ref_(name)
self._print_header_(name, dot='-', is_title=False)
self.stream.write('''.. autoclass:: paddle.{0}.{1}
:members:
:noindex:
'''.format(self.module_name, name))
def print_method(self, name):
self._print_ref_(name)
self._print_header_(name, dot='-', is_title=False)
self.stream.write('''.. autofunction:: paddle.{0}.{1}
:noindex:
'''.format(self.module_name, name))
def _print_header_(self, name, dot, is_title):
dot_line = dot * len(name)
if is_title:
self.stream.write(dot_line)
self.stream.write('\n')
self.stream.write(name)
self.stream.write('\n')
self.stream.write(dot_line)
self.stream.write('\n')
self.stream.write('\n')
def _print_ref_(self, name):
self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join(
self.module_name.split(".")), name))
def main():
args = parse_arg()
gen = DocGenerator(args.module)
if args.submodules is None:
gen.print_current_module()
else:
for submodule_name in args.submodules:
gen.print_submodule(submodule_name)
if __name__ == '__main__':
main()
#!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op > layers.rst
for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average
do
python gen_doc.py ${module} > ${module}.rst
done
python gen_doc.py "" > fluid.rst
=============
API Reference
=============
.. toctree::
:maxdepth: 1
fluid.rst
layers.rst
data_feeder.rst
executor.rst
initializer.rst
metrics.rst
nets.rst
clip.rst
optimizer.rst
param_attr.rst
profiler.rst
regularizer.rst
io.rst
data.rst
transpiler.rst
recordio_writer.rst
backward.rst
average.rst
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=================
fluid.initializer
=================
.. _api_fluid_initializer_Constant:
Constant
--------
.. autoclass:: paddle.fluid.initializer.Constant
:members:
:noindex:
.. _api_fluid_initializer_Uniform:
Uniform
-------
.. autoclass:: paddle.fluid.initializer.Uniform
:members:
:noindex:
.. _api_fluid_initializer_Normal:
Normal
------
.. autoclass:: paddle.fluid.initializer.Normal
:members:
:noindex:
.. _api_fluid_initializer_Xavier:
Xavier
------
.. autoclass:: paddle.fluid.initializer.Xavier
:members:
:noindex:
.. _api_fluid_initializer_Bilinear:
Bilinear
--------
.. autoclass:: paddle.fluid.initializer.Bilinear
:members:
:noindex:
.. _api_fluid_initializer_MSRA:
MSRA
----
.. autoclass:: paddle.fluid.initializer.MSRA
:members:
:noindex:
.. _api_fluid_initializer_force_init_on_cpu:
force_init_on_cpu
-----------------
.. autofunction:: paddle.fluid.initializer.force_init_on_cpu
:noindex:
.. _api_fluid_initializer_init_on_cpu:
init_on_cpu
-----------
.. autofunction:: paddle.fluid.initializer.init_on_cpu
:noindex:
.. _api_fluid_initializer_ConstantInitializer:
ConstantInitializer
-------------------
.. autoclass:: paddle.fluid.initializer.ConstantInitializer
:members:
:noindex:
.. _api_fluid_initializer_UniformInitializer:
UniformInitializer
------------------
.. autoclass:: paddle.fluid.initializer.UniformInitializer
:members:
:noindex:
.. _api_fluid_initializer_NormalInitializer:
NormalInitializer
-----------------
.. autoclass:: paddle.fluid.initializer.NormalInitializer
:members:
:noindex:
.. _api_fluid_initializer_XavierInitializer:
XavierInitializer
-----------------
.. autoclass:: paddle.fluid.initializer.XavierInitializer
:members:
:noindex:
.. _api_fluid_initializer_BilinearInitializer:
BilinearInitializer
-------------------
.. autoclass:: paddle.fluid.initializer.BilinearInitializer
:members:
:noindex:
.. _api_fluid_initializer_MSRAInitializer:
MSRAInitializer
---------------
.. autoclass:: paddle.fluid.initializer.MSRAInitializer
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
========
fluid.io
========
.. _api_fluid_io_save_vars:
save_vars
---------
.. autofunction:: paddle.fluid.io.save_vars
:noindex:
.. _api_fluid_io_save_params:
save_params
-----------
.. autofunction:: paddle.fluid.io.save_params
:noindex:
.. _api_fluid_io_save_persistables:
save_persistables
-----------------
.. autofunction:: paddle.fluid.io.save_persistables
:noindex:
.. _api_fluid_io_load_vars:
load_vars
---------
.. autofunction:: paddle.fluid.io.load_vars
:noindex:
.. _api_fluid_io_load_params:
load_params
-----------
.. autofunction:: paddle.fluid.io.load_params
:noindex:
.. _api_fluid_io_load_persistables:
load_persistables
-----------------
.. autofunction:: paddle.fluid.io.load_persistables
:noindex:
.. _api_fluid_io_save_inference_model:
save_inference_model
--------------------
.. autofunction:: paddle.fluid.io.save_inference_model
:noindex:
.. _api_fluid_io_load_inference_model:
load_inference_model
--------------------
.. autofunction:: paddle.fluid.io.load_inference_model
:noindex:
.. _api_fluid_io_get_inference_program:
get_inference_program
---------------------
.. autofunction:: paddle.fluid.io.get_inference_program
:noindex:
.. _api_fluid_io_save_checkpoint:
save_checkpoint
---------------
.. autofunction:: paddle.fluid.io.save_checkpoint
:noindex:
.. _api_fluid_io_load_checkpoint:
load_checkpoint
---------------
.. autofunction:: paddle.fluid.io.load_checkpoint
:noindex:
.. _api_fluid_io_clean_checkpoint:
clean_checkpoint
----------------
.. autofunction:: paddle.fluid.io.clean_checkpoint
:noindex:
.. _api_fluid_io_load_persist_vars_without_grad:
load_persist_vars_without_grad
------------------------------
.. autofunction:: paddle.fluid.io.load_persist_vars_without_grad
:noindex:
.. _api_fluid_io_save_persist_vars_without_grad:
save_persist_vars_without_grad
------------------------------
.. autofunction:: paddle.fluid.io.save_persist_vars_without_grad
:noindex:
.. _api_fluid_io_get_latest_checkpoint_serial:
get_latest_checkpoint_serial
----------------------------
.. autofunction:: paddle.fluid.io.get_latest_checkpoint_serial
:noindex:
(This diff has been collapsed.)
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=============
fluid.metrics
=============
.. _api_fluid_metrics_MetricBase:
MetricBase
----------
.. autoclass:: paddle.fluid.metrics.MetricBase
:members:
:noindex:
.. _api_fluid_metrics_CompositeMetric:
CompositeMetric
---------------
.. autoclass:: paddle.fluid.metrics.CompositeMetric
:members:
:noindex:
.. _api_fluid_metrics_Precision:
Precision
---------
.. autoclass:: paddle.fluid.metrics.Precision
:members:
:noindex:
.. _api_fluid_metrics_Recall:
Recall
------
.. autoclass:: paddle.fluid.metrics.Recall
:members:
:noindex:
.. _api_fluid_metrics_Accuracy:
Accuracy
--------
.. autoclass:: paddle.fluid.metrics.Accuracy
:members:
:noindex:
.. _api_fluid_metrics_ChunkEvaluator:
ChunkEvaluator
--------------
.. autoclass:: paddle.fluid.metrics.ChunkEvaluator
:members:
:noindex:
.. _api_fluid_metrics_EditDistance:
EditDistance
------------
.. autoclass:: paddle.fluid.metrics.EditDistance
:members:
:noindex:
.. _api_fluid_metrics_DetectionMAP:
DetectionMAP
------------
.. autoclass:: paddle.fluid.metrics.DetectionMAP
:members:
:noindex:
.. _api_fluid_metrics_Auc:
Auc
---
.. autoclass:: paddle.fluid.metrics.Auc
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==========
fluid.nets
==========
.. _api_fluid_nets_simple_img_conv_pool:
simple_img_conv_pool
--------------------
.. autofunction:: paddle.fluid.nets.simple_img_conv_pool
:noindex:
.. _api_fluid_nets_sequence_conv_pool:
sequence_conv_pool
------------------
.. autofunction:: paddle.fluid.nets.sequence_conv_pool
:noindex:
.. _api_fluid_nets_glu:
glu
---
.. autofunction:: paddle.fluid.nets.glu
:noindex:
.. _api_fluid_nets_scaled_dot_product_attention:
scaled_dot_product_attention
----------------------------
.. autofunction:: paddle.fluid.nets.scaled_dot_product_attention
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
===============
fluid.optimizer
===============
.. _api_fluid_optimizer_SGD:
SGD
---
.. autoclass:: paddle.fluid.optimizer.SGD
:members:
:noindex:
.. _api_fluid_optimizer_Momentum:
Momentum
--------
.. autoclass:: paddle.fluid.optimizer.Momentum
:members:
:noindex:
.. _api_fluid_optimizer_Adagrad:
Adagrad
-------
.. autoclass:: paddle.fluid.optimizer.Adagrad
:members:
:noindex:
.. _api_fluid_optimizer_Adam:
Adam
----
.. autoclass:: paddle.fluid.optimizer.Adam
:members:
:noindex:
.. _api_fluid_optimizer_Adamax:
Adamax
------
.. autoclass:: paddle.fluid.optimizer.Adamax
:members:
:noindex:
.. _api_fluid_optimizer_DecayedAdagrad:
DecayedAdagrad
--------------
.. autoclass:: paddle.fluid.optimizer.DecayedAdagrad
:members:
:noindex:
.. _api_fluid_optimizer_Ftrl:
Ftrl
----
.. autoclass:: paddle.fluid.optimizer.Ftrl
:members:
:noindex:
.. _api_fluid_optimizer_SGDOptimizer:
SGDOptimizer
------------
.. autoclass:: paddle.fluid.optimizer.SGDOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_MomentumOptimizer:
MomentumOptimizer
-----------------
.. autoclass:: paddle.fluid.optimizer.MomentumOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_AdagradOptimizer:
AdagradOptimizer
----------------
.. autoclass:: paddle.fluid.optimizer.AdagradOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_AdamOptimizer:
AdamOptimizer
-------------
.. autoclass:: paddle.fluid.optimizer.AdamOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_AdamaxOptimizer:
AdamaxOptimizer
---------------
.. autoclass:: paddle.fluid.optimizer.AdamaxOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_DecayedAdagradOptimizer:
DecayedAdagradOptimizer
-----------------------
.. autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_RMSPropOptimizer:
RMSPropOptimizer
----------------
.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_FtrlOptimizer:
FtrlOptimizer
-------------
.. autoclass:: paddle.fluid.optimizer.FtrlOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_Adadelta:
Adadelta
--------
.. autoclass:: paddle.fluid.optimizer.Adadelta
:members:
:noindex:
.. _api_fluid_optimizer_ModelAverage:
ModelAverage
------------
.. autoclass:: paddle.fluid.optimizer.ModelAverage
:members:
:noindex:
.. _api_fluid_optimizer_Optimizer:
Optimizer
---------
.. autoclass:: paddle.fluid.optimizer.Optimizer
:members:
:noindex:
.. _api_fluid_optimizer_RMSPropOptimizer:
RMSPropOptimizer
----------------
.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
================
fluid.param_attr
================
.. _api_fluid_param_attr_ParamAttr:
ParamAttr
---------
.. autoclass:: paddle.fluid.param_attr.ParamAttr
:members:
:noindex:
.. _api_fluid_param_attr_WeightNormParamAttr:
WeightNormParamAttr
-------------------
.. autoclass:: paddle.fluid.param_attr.WeightNormParamAttr
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==============
fluid.profiler
==============
.. _api_fluid_profiler_cuda_profiler:
cuda_profiler
-------------
.. autofunction:: paddle.fluid.profiler.cuda_profiler
:noindex:
.. _api_fluid_profiler_reset_profiler:
reset_profiler
--------------
.. autofunction:: paddle.fluid.profiler.reset_profiler
:noindex:
.. _api_fluid_profiler_profiler:
profiler
--------
.. autofunction:: paddle.fluid.profiler.profiler
:noindex:
.. _api_fluid_profiler_start_profiler:
start_profiler
--------------
.. autofunction:: paddle.fluid.profiler.start_profiler
:noindex:
.. _api_fluid_profiler_stop_profiler:
stop_profiler
-------------
.. autofunction:: paddle.fluid.profiler.stop_profiler
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=====================
fluid.recordio_writer
=====================
.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
convert_reader_to_recordio_file
-------------------------------
.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
:noindex:
.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
convert_reader_to_recordio_files
--------------------------------
.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=================
fluid.regularizer
=================
.. _api_fluid_regularizer_append_regularization_ops:
append_regularization_ops
-------------------------
.. autofunction:: paddle.fluid.regularizer.append_regularization_ops
:noindex:
.. _api_fluid_regularizer_L1Decay:
L1Decay
-------
.. autoclass:: paddle.fluid.regularizer.L1Decay
:members:
:noindex:
.. _api_fluid_regularizer_L2Decay:
L2Decay
-------
.. autoclass:: paddle.fluid.regularizer.L2Decay
:members:
:noindex:
.. _api_fluid_regularizer_L1DecayRegularizer:
L1DecayRegularizer
------------------
.. autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
:members:
:noindex:
.. _api_fluid_regularizer_L2DecayRegularizer:
L2DecayRegularizer
------------------
.. autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
================
fluid.transpiler
================
.. _api_fluid_transpiler_DistributeTranspiler:
DistributeTranspiler
--------------------
.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler
:members:
:noindex:
.. _api_fluid_transpiler_InferenceTranspiler:
InferenceTranspiler
-------------------
.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
:members:
:noindex:
.. _api_fluid_transpiler_memory_optimize:
memory_optimize
---------------
.. autofunction:: paddle.fluid.transpiler.memory_optimize
:noindex:
.. _api_fluid_transpiler_release_memory:
release_memory
--------------
.. autofunction:: paddle.fluid.transpiler.release_memory
:noindex:
.. _api_fluid_transpiler_HashName:
HashName
--------
.. autoclass:: paddle.fluid.transpiler.HashName
:members:
:noindex:
.. _api_fluid_transpiler_RoundRobin:
RoundRobin
----------
.. autoclass:: paddle.fluid.transpiler.RoundRobin
:members:
:noindex:
../../v2/build_and_install/build_from_source_cn.rst
\ No newline at end of file
../../v2/build_and_install/build_from_source_en.rst
\ No newline at end of file
../../v2/build_and_install/docker_install_cn.rst
\ No newline at end of file
../../v2/build_and_install/docker_install_en.rst
\ No newline at end of file
../../v2/build_and_install/index_cn.rst
\ No newline at end of file
../../v2/build_and_install/index_en.rst
\ No newline at end of file
../../v2/build_and_install/paddleci.png
\ No newline at end of file
../../v2/build_and_install/pip_install_cn.rst
\ No newline at end of file
../../v2/build_and_install/pip_install_en.rst
\ No newline at end of file
Gradient Update Algorithm
-------------------------
.. toctree::
:maxdepth: 1
parameter_average.md
Gradient Update Algorithm
--------------------------------------
.. toctree::
:maxdepth: 1
parameter_average.md
# Averaging Parameter in PaddlePaddle
## Why Averaging
In a large-scale machine learning setup where the training data is huge, it can take many iterations over the training data before we reach the optimal values of our model's parameters. It is therefore desirable to obtain those optimal values in as few passes over the data as possible.
Polyak and Juditsky (1992) showed that the test performance of a simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values obtained by repeatedly training the model over the training dataset.
Hence, to accelerate Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). In ASGD, the running average of the parameters obtained by SGD is used as the estimator for <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/theta_star.gif"/><br/>. The averaging is done as follows:
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/asgd.gif"><br />
</p>
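In standard notation (our reading of the formula image above), the ASGD estimate after $t$ SGD iterates $\theta_1, \dots, \theta_t$ is the running average

$$\bar{\theta}_t = \frac{1}{t}\sum_{i=1}^{t}\theta_i = \bar{\theta}_{t-1} + \frac{1}{t}\left(\theta_t - \bar{\theta}_{t-1}\right),$$

where the incremental form on the right is what an implementation would actually maintain.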
We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
### How to perform Parameter Averaging in PaddlePaddle
Parameter Averaging in PaddlePaddle works in the following way during training:
1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer
2. The optimizer itself is responsible for updating the parameters.
3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches.
2. However, saving all N instances of the parameters in memory is not feasible.
3. Therefore, an approximation algorithm is used.
Hence, overall we have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in backpropagation, while the latter should be used during testing and should be saved.
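The window-based approximation can be sketched in a few lines of NumPy. Everything below is illustrative pseudocode for the bookkeeping described above, not the actual ParameterAverageOptimizer kernel; the class and method names are invented for this sketch:
```python
import numpy as np

class WindowedParamAverager(object):
    """Approximate running average of a parameter over the last N updates,
    using one accumulator instead of storing all N parameter copies."""

    def __init__(self, window_size):
        self.window_size = window_size
        self.acc = None     # sum of recent parameter values
        self.count = 0      # number of updates in the accumulator

    def update(self, param):
        """Call after each optimizer step with the freshly updated parameter."""
        if self.acc is None:
            self.acc = np.zeros_like(param)
        self.acc += param
        self.count += 1
        if self.count > self.window_size:
            # Approximation: restart the window from the current value rather
            # than subtracting the oldest update, which was never stored.
            self.acc = param.copy()
            self.count = 1

    def average(self):
        return self.acc / self.count
```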
During the testing/model-saving phase, we perform the following steps (see the sketch after this list):
1. Perform the delayed operations.
2. Save current values of the parameters to a temporary variable.
3. Replace the values of the parameters with the averaged values.
4. Perform testing and/or save the parameters.
5. Restore the values of the parameters once done.
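The save/swap/restore protocol above maps naturally onto a context manager. A hedged sketch, with parameters modeled as NumPy arrays; the names are illustrative, not a Fluid API:
```python
from contextlib import contextmanager

@contextmanager
def averaged_params(params, averages):
    """Steps 2-5 above: back up current parameter values, swap in the
    averaged values, run testing/saving, then restore the originals.
    `params` and `averages` map parameter name -> numpy array."""
    backup = {name: p.copy() for name, p in params.items()}  # step 2
    for name in params:
        params[name][...] = averages[name]                   # step 3
    try:
        yield params                                         # step 4
    finally:
        for name in params:
            params[name][...] = backup[name]                 # step 5

# Usage sketch:
# with averaged_params(model_params, averager_values):
#     run_tests_or_save_model()
```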
### How to implement Parameter Averaging in PaddlePaddle
We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training.
**Advantages**:
- Gives the users of PaddlePaddle greater flexibility: they can plug different optimizers into ParameterAverageOptimizer by passing the optimizer to the op.
- Makes it easy for the users to customize and extend the framework.
**Disadvantages**:
- Implementation requires re-writing the averaging methodology in Python.
### Low-Level implementation
In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input:
- the optimizer
- the window_size to keep the updates
The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
### Python API implementation for ParameterAverageOptimizer
Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following:
- Any optimizer (RMSProp, AdaGrad, etc.)
- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes their average. When the window is full, the averaged value is moved to a buffer to avoid loss of precision.
Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
We will have a wrapper written in Python that supports the functionality and implements the actual core computation in the C++ core, as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.cc).
#### Creation of the ParameterAverageOptimizer operator
There are two ways for creating the ParameterAverageOptimizer op:
1. We create the op immediately while building the computation graph.
2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
The proposal is to add the op immediately while building the computation graph.
#### High-level API
In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of the new parameter server using Go and C++.
Here are some initial thoughts. Your comments are welcome!
# Required CMake Function
I think we need only the following few CMake functions to make a project description mean and clean:
<table>
<thead>
<tr>
<th>C++</th>
<th>CUDA C++</th>
<th>Go</th>
</tr>
</thead>
<tbody>
<tr>
<td>cc_library </td>
<td>nv_library </td>
<td>go_library </td>
</tr>
<tr>
<td>cc_binary </td>
<td>nv_binary </td>
<td>go_binary </td>
</tr>
<tr>
<td> cc_test </td>
<td> nv_test </td>
<td> go_test </td>
</tr>
</tbody>
</table>
- The `_library` functions generate .a files from source code.
- The `_binary` functions generate executable binary files.
- The `_test` functions generate executable unit test files. They work like `_binary` but link `-lgtest` and `-lgtest_main`.
The difference between `nv_` functions and `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler.
Both `nv_` and `cc_` functions enable C++11 (-std=c++11).
Also,
- to describe external dependencies, we need `external_library`.
- to build shared libraries, we need `shared_library`.
## An Example Project
Suppose that we have the aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files:
- tensor.h
- tensor.cc
- tensor_test.cc
- ops.h
- ops.cu
- ops_test.cu
- api.go
- api_test.go
Suppose that ops.cu depends on CUDNN.
```cmake
# cc_binary parses tensor.cc and figures out that the target also depends
# on tensor.h.
cc_binary(tensor
SRCS
tensor.cc)
# The dependency on target tensor implies that if any of
# tensor{.h,.cc,_test.cc} is changed, tensor_test needs to be re-built.
cc_test(tensor_test
SRCS
tensor_test.cc
DEPS
tensor)
# I don't have a clear idea what parameters external_library needs to
# have. @gangliao as a CMake expert would have better ideas.
external_library(cudnn
....)
# Suppose that ops.cu depends on external target CUDNN. Also, ops.cu
# includes global functions that take Tensor as their parameters, so
# ops depends on tensor. This implies that if any of tensor.{h,cc} or
# ops.{h,cu} is changed, ops needs to be re-built.
nv_library(ops
SRCS
ops.cu
DEPS
tensor
cudnn) # cudnn is defined later.
nv_test(ops_test
SRCS
ops_test.cu
DEPS
ops)
# Because api.go defines a Go wrapper to ops and tensor, it depends on
# both. This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
# api.go is changed, api needs to be re-built.
go_library(api
SRCS
api.go
DEPS
tensor # Because ops depend on tensor, this line is optional.
ops)
go_test(api_test
SRCS
api_test.go
DEPS
api)
# This builds libapi.so. shared_library might use CMake target
# api_shared so as to distinguish it from the above target api.
shared_library(api
DEPS
api)
```
## Implementation
As the above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also uses this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
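To make this concrete, here is a hedged Python sketch of such a generator: each `*_library` call records a node in the graph, and a final pass emits the corresponding CMake commands. All names are illustrative; the real implementation lives in CMake functions, not in Python:
```python
# Toy dependency-graph-to-CMake generator (illustrative only).
targets = {}  # target name -> {"srcs": [...], "deps": [...]}

def cc_library(name, srcs, deps=()):
    """Record a node; mirrors the cc_library(...) call in CMakeLists.txt."""
    targets[name] = {"srcs": list(srcs), "deps": list(deps)}

def emit_cmake():
    """Walk the recorded graph and generate CMake commands."""
    lines = []
    for name, t in targets.items():
        lines.append("add_library({0} STATIC {1})".format(name, " ".join(t["srcs"])))
        for dep in t["deps"]:
            lines.append("add_dependencies({0} {1})".format(name, dep))
            lines.append("target_link_libraries({0} {1})".format(name, dep))
    return "\n".join(lines)

cc_library("tensor", ["tensor.cc"])
cc_library("ops", ["ops.cu"], deps=["tensor"])
print(emit_cmake())
```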
## Using Package Manager For Go
Building Go binaries and libraries requires satisfying their dependencies; generally
we can run `go get ./...` to download and compile all external dependencies. The
problems are:
1. `go get` will always fetch the latest code from the default branch of the
remote repo, so changes in dependencies might break the build. This is very
different from what we already have in `cmake/external`, which downloads a
specific version or commit id of each dependency.
1. Some locations cannot access external dependencies through the internet, as mentioned
in https://github.com/PaddlePaddle/Paddle/issues/2605. Package management
tools can bundle the dependencies as a "vendor" package, which can be mirrored
on many cloud file hosts, so users who want to compile Paddle by themselves can
download this "vendor" package from a mirror site.
### Choose A Suitable Tool
As mentioned by @wangkuiyi, [this page](https://github.com/golang/go/wiki/PackageManagementTools)
lists dozens of Go package managers. We choose a tool using the following principles:
- The most "active" projects, with more stars and more pull requests or commits
- Widely used projects
After comparing these projects, the choice comes down to the two most popular
tools: Godep and Glide.
Here's a brief comparison between Godep and
Glide: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are
also many complaints about using Godep. A new "official" package
management tool has been started at https://github.com/golang/dep to resolve
such problems, but it is currently in alpha. So the best choice for now is
Glide.
### Manage Go Packages
- Dependencies: `go/glide.yaml` stores the dependencies and their versions that are
directly imported by Paddle. `go/glide.lock` stores all dependencies recursively,
with their commit ids. Builds "lock" to these packages unless we run `glide up` to
update them.
- Vendor package: the `go/vendor` directory is generated when running the `cmake` command; `cmake`
downloads the code corresponding to `go/glide.lock`. If we put a vendor folder
under `go/`, cmake just checks the commit ids of the packages under the folder;
if they match, nothing is downloaded.
# Design Doc: Block and Scope
## The Representation of Computation
Both deep learning systems and programming languages help users describe computation procedures. These systems use various representations of computation:
- Caffe, Torch, and Paddle: sequences of layers.
- TensorFlow, Caffe2, MXNet: graphs of operators.
- PaddlePaddle: nested blocks, like C++ and Java programs.
## Block in Programming Languages and Deep Learning
In programming languages, a block is a pair of curly braces that includes local variable definitions and a sequence of instructions or operators.
Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
<table>
<thead>
<tr>
<th>programming languages</th>
<th>PaddlePaddle</th>
</tr>
</thead>
<tbody>
<tr>
<td>for, while loop </td>
<td>RNN, WhileOp </td>
</tr>
<tr>
<td>if, if-else, switch </td>
<td>IfElseOp, SwitchOp </td>
</tr>
<tr>
<td>sequential execution </td>
<td>a sequence of layers </td>
</tr>
</tbody>
</table>
A key difference is that a C++ program describes a one-pass computation, whereas a deep learning program describes both the forward and backward passes.
## Stack Frames and the Scope Hierarchy
The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
<table>
<thead>
<tr>
<th>programming languages</th>
<th>PaddlePaddle</th>
</tr>
</thead>
<tbody>
<tr>
<td>stack </td>
<td>scope hierarchy </td>
</tr>
<tr>
<td>stack frame </td>
<td>scope </td>
</tr>
<tr>
<td>push at entering block </td>
<td>push at entering block </td>
</tr>
<tr>
<td>pop at leaving block </td>
<td>destroy when minibatch completes </td>
</tr>
</tbody>
</table>
1. In traditional programs:
- When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables.
- After the execution leaves the right curly brace, the runtime pops the frame.
- The maximum number of frames in the stack is the maximum depth of nested blocks.
1. In PaddlePaddle:
- When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables.
- PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass. So it has a stack forest known as a *scope hierarchy*.
- The height of the highest tree is the maximum depth of nested blocks.
- After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy.
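To make this concrete, below is a minimal C++ sketch of a scope with a parent link (the `Variable` payload is elided, and the names are illustrative rather than PaddlePaddle's actual API). A local lookup falls back to the parent scope, which is what makes the hierarchy work:
```c++
#include <map>
#include <string>

struct Variable { /* tensor payload elided */ };

class Scope {
 public:
  explicit Scope(Scope* parent = nullptr) : parent_(parent) {}

  // Creates (or returns) a variable local to this scope.
  Variable* NewVar(const std::string& name) { return &vars_[name]; }

  // Searches this scope first, then the ancestors, mirroring lexical scoping.
  Variable* FindVar(const std::string& name) {
    auto it = vars_.find(name);
    if (it != vars_.end()) return &it->second;
    return parent_ ? parent_->FindVar(name) : nullptr;
  }

 private:
  Scope* parent_;
  std::map<std::string, Variable> vars_;
};
```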
## Use Blocks in C++ and PaddlePaddle Programs
Let us consolidate the discussion by presenting some examples.
### Blocks with `if-else` and `IfElseOp`
The following C++ program shows how blocks are used with the `if-else` structure:
```c++
namespace pd = paddle;
int x = 10;
int y = 1;
int z = 10;
bool cond = false;
int o1, o2;
if (cond) {
int z = x + y;
o1 = z;
o2 = pd::layer::softmax(z);
} else {
int d = pd::layer::fc(z);
o1 = d;
o2 = d+1;
}
```
An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](../execution/if_else_op.md) is as follows:
```python
import paddle as pd
x = minibatch([10, 20, 30]) # shape=[None, 1]
y = var(1) # shape=[1], value=1
z = minibatch([10, 20, 30]) # shape=[None, 1]
cond = larger_than(x, 15) # [false, true, true]
ie = pd.ifelse()
with ie.true_block():
d = pd.layer.add_scalar(x, y)
ie.output(d, pd.layer.softmax(d))
with ie.false_block():
d = pd.layer.fc(z)
ie.output(d, d+1)
o1, o2 = ie(cond)
```
In both examples, the true branch computes `x+y` and `softmax(x+y)`, while the false branch computes `fc(z)` and `fc(z)+1`.
The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.
### Blocks with `for` and `RNNOp`
The following RNN model in PaddlePaddle, from the [RNN design doc](../dynamic_rnn/rnn.md):
```python
x = sequence([10, 20, 30]) # shape=[None, 1]
m = var(0) # shape=[1]
W = var(0.314, param=true) # shape=[1]
U = var(0.375, param=true) # shape=[1]
rnn = pd.rnn()
with rnn.step():
h = rnn.memory(init = m)
h_prev = rnn.previous_memory(h)
a = layer.fc(W, x)
b = layer.fc(U, h_prev)
s = pd.add(a, b)
act = pd.sigmoid(s)
rnn.update_memory(h, act)
rnn.output(a, b)
o1, o2 = rnn()
```
has its equivalent C++ program as follows
```c++
float x[] = {10, 20, 30};
float m = 0;
float W = 0.314;
float U = 0.375;
const int n = sizeof(x) / sizeof(x[0]);
float mem[n + 1];
float o1[n + 1];
float o2[n + 1];
mem[0] = m;
for (int i = 1; i <= n; ++i) {
  float a = W * x[i - 1];    // a = layer.fc(W, x)
  float b = U * mem[i - 1];  // b = layer.fc(U, h_prev)
  float s = a + b;           // s = pd.add(a, b)
  float act = sigmoid(s);    // act = pd.sigmoid(s)
  mem[i] = act;              // rnn.update_memory(h, act)
  o1[i] = a;                 // rnn.output(a, b)
  o2[i] = b;
}
```
## Compilation and Execution
Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference.
The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file.
## The "Binary Executable File Format"
The definition of the protobuf message is as follows:
```protobuf
message BlockDesc {
repeated VarDesc vars = 1;
repeated OpDesc ops = 2;
}
```
The step net in the above RNN example would look like:
```
BlockDesc {
vars = {
VarDesc {...} // x
VarDesc {...} // h
VarDesc {...} // fc_out
VarDesc {...} // hidden_out
VarDesc {...} // sum
VarDesc {...} // act
}
ops = {
OpDesc {...} // matmul
OpDesc {...} // add_two
OpDesc {...} // sigmoid
}
};
```
Also, the RNN operator in the above example is serialized into a protobuf message of type `OpDesc`, which would look like:
```
OpDesc {
inputs = {0} // the index of x in vars of BlockDesc above
outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
attrs {
"states" : {1} // the index of h
"step_net" : <above step net>
}
};
```
This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block.
## The Compilation of Blocks
During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
Each VarDesc in a block should have its own name scope, to avoid local variables affecting the parent block's name scope.
A child block's name scope should inherit the parent's, so that an OpDesc in the child block can reference a VarDesc stored in the parent block. For example:
```python
a = pd.Variable(shape=[20, 20])
b = pd.fc(a, params=["fc.w", "fc.b"])
rnn = pd.create_rnn()
with rnn.stepnet():
x = a.as_step_input()
# reuse fc's parameter
fc_without_b = pd.get_variable("fc.w")
rnn.output(fc_without_b)
out = rnn()
```
The method `pd.get_variable` can help retrieve a Variable by name. The Variable may be stored in a parent block but retrieved in a child block, so blocks should have a variable scope that supports inheritance.
In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc.
To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers.
`SymbolTable` can do the following:
- store the definitions (some names and attributes) of variables and operators,
- verify if a variable was declared,
- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
```c++
// Information in SymbolTable is enough to trace the dependency graph, so an
// Eval() interface that takes a SymbolTable may be enough.
class SymbolTable {
public:
SymbolTable(SymbolTable* parent) : parent_(parent) {}
OpDesc* NewOp(const string& name="");
// TODO determine whether name is generated by python or C++.
// Currently assume that a unique name will be generated by C++ if the
// argument name is left default.
VarDesc* Var(const string& name="");
// Finds a VarDesc by name; if recursive is true, searches the parent's
// SymbolTable recursively.
// This interface is introduced to support InferShape: it finds the protobuf
// messages of variables and operators and passes pointers into InferShape.
//
// NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should
// be proposed and embedded into pybind to enable python operation on C++ pointers.
VarDesc* FindVar(const string& name, bool recursive=true);
OpDesc* FindOp(const string& name);
BlockDesc Compile() const;
private:
SymbolTable* parent_;
map<string, OpDesc> ops_;
map<string, VarDesc> vars_;
};
```
After all the description of variables and operators is added into SymbolTable,
the block has enough information to run.
The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions.
```c++
namespace {
class Block : public OperatorBase {
public:
explicit Block(const BlockDesc& desc) : desc_(desc) {}
void InferShape(const framework::Scope& scope) const override {
if (!symbols_ready_) {
CreateVariables(scope);
CreateOperators();
}
// should run InferShape first.
for (auto& op : runtime_table_.ops()) {
op->InferShape(scope);
}
}
void Run(const framework::Scope& scope,
const platform::Place& place) const override {
PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
for (auto& op : runtime_table_.ops()) {
op->Run(scope, place);
}
}
void CreateVariables(const framework::Scope& scope);
void CreateOperators();
// some other necessary interfaces of NetOp are listed below
// ...
private:
BlockDesc desc_;
bool symbols_ready_{false};
};
}  // namespace
```
## The Execution of Blocks
Block inherits from OperatorBase, which has a Run method.
Block's Run method will run its operators sequentially.
There is another important interface called `Eval`, which takes a list of targets, generates a minimal graph that treats the targets as end points, and creates a new Block from it. After running this minimal block, `Eval` fetches the latest values of the targets and returns them.
The definition of Eval is as follows:
```c++
// clean a block description by targets using the corresponding dependency graph.
// return a new BlockDesc with minimal number of operators.
// NOTE: The return type is not a Block but the block's description so that this can be distributed
// to a cluster.
BlockDesc Prune(const BlockDesc& desc, vector<string> targets);
void Block::Eval(const vector<string>& targets,
const framework::Scope& scope,
const platform::Place& place) {
BlockDesc min_desc = Prune(desc_, targets);
Block min_block(min_desc);
min_block.Run(scope, place);
}
```
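For example, calling `Eval` with only `o1` of the RNN example as the target would prune away the operators that contribute solely to `o2` and then run the resulting minimal block.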
# C++ Data Feeding
When using the Paddle V2 API for training, data feeding depends completely on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required.
In this document, we show the fundamental design of a C++ data feeding process, which includes data reading, shuffling and batching.
## Overview
![](images/readers.png)
## Reader
To handle the above-mentioned problem, a new concept called 'Reader' is introduced. `Reader` is a hierarchy of classes whose instances can be held by our `Variable`s and are used to read or process file data.
### ReaderBase
`ReaderBase` is the abstract base class for all readers. It defines the interface for all readers.
```cpp
class ReaderBase {
public:
// Reads the next batch of data. (A 'batch' can be only one instance)
// If the next batch doesn't exist, it throws an exception
virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
// Checks whether the next instance exists.
virtual bool HasNext() = 0;
// Reinitializes the reader and read the file from the beginning.
virtual void ReInit() = 0;
virtual ~ReaderBase();
};
```
### FileReader
`FileReader` is derived from `ReaderBase`. It is still an abstract class and will be further derived by readers for specific file formats.
```cpp
class FileReader : public ReaderBase {
public:
explicit FileReader(const std::vector<DDim>& dims);
void ReadNext(std::vector<LoDTensor>* out) override;
protected:
virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
private:
std::vector<DDim> dims_;
};
```
A file reader binds to a single file and reads one data instance at a time. Each type of file reader shall implement its own `ReadNextImpl()`, `HasNext()` and `ReInit()`.
`ReadNextImpl()` is invoked by `ReadNext()`. Besides invoking `ReadNextImpl()`, `ReadNext()` is also responsible for checking the output, making sure that the shape of each `LoDTensor` in `*out` is consistent with the corresponding one in `dims_`.
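As an illustration, `ReadNext()` might be implemented roughly as follows, assuming Paddle's `PADDLE_ENFORCE_EQ` macro and that `LoDTensor` exposes `dims()`; this is a sketch, not the actual implementation:
```cpp
void FileReader::ReadNext(std::vector<LoDTensor>* out) {
  ReadNextImpl(out);  // the format-specific reader fills 'out'
  PADDLE_ENFORCE_EQ(out->size(), dims_.size());
  for (size_t i = 0; i < out->size(); ++i) {
    // Each output tensor must match the shape the reader was created with.
    PADDLE_ENFORCE_EQ((*out)[i].dims(), dims_[i]);
  }
}
```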
### DecoratedReader
A decorated reader takes another reader (either a file reader or another decorated reader) as its 'underlying reader'. It gets data from its underlying reader, does some processing on it (shuffling, batching, or something else), and then yields the processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
```cpp
class DecoratedReader : public ReaderBase {
public:
explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
PADDLE_ENFORCE_NOT_NULL(reader_);
}
void ReInit() override { reader_->ReInit(); }
bool HasNext() override { return reader_->HasNext(); }
protected:
ReaderBase* reader_;
};
```
Both `FileReader` and `DecoratedReader` share exactly the same interface as defined in `ReaderBase`, so they can be decorated multiple times: we can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops to use readers without knowing their underlying type.
### MultipleReader
Each `FileReader` binds to a single file and is single-threaded. However, sometimes we need to read data from more than one file. In this case, it's not enough to only have `FileReader` and `DecoratedReader`.
So `MultipleReader` is introduced. It is also derived from `ReaderBase`. A `MultipleReader` holds several prefetching `FileReader`s, and these readers run concurrently. Another pivotal part of a `MultipleReader` is a buffer channel. The channel collects data yielded by all prefetching readers and enables subsequent ops or decorated readers to fetch data without worrying about the scheduling of multiple readers.
![](images/multiple_reader.png)
This graph shows how a `MultipleReader` works with three prefetching file readers and two GPUs. There is a queue of files waiting to be read. Whenever a prefetching file reader is free (has finished reading one file), it fetches a new file from the queue. Each prefetching file reader runs in a separate prefetch thread and dumps its output to the same channel.
To the two subsequent decorated readers, the `MultipleReader` is **a single reader**. They don't need to worry about how the prefetch readers are scheduled; they only need to invoke `MultipleReader::ReadNext()` to get the next data from the buffer channel.
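As a rough sketch (not the actual implementation), `MultipleReader::ReadNext()` might simply drain the buffer channel. Here `buffer_` is assumed to be a channel-like structure whose blocking `Receive()` returns `false` once all prefetch threads have finished and closed it:
```cpp
void MultipleReader::ReadNext(std::vector<LoDTensor>* out) {
  if (!buffer_->Receive(out)) {
    // All prefetching file readers reached EOF and the channel was closed.
    PADDLE_THROW("There is no next data!");
  }
}
```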
### ReaderHolder
Different readers belong to different class types. This leads to a problem: how can we drop them into `Variable`s and fetch them out using a unified method? For example, if a Variable holds a `BatchReader`, we cannot get it with the following code:
```cpp
var->Get<ReaderBase>("batch_reader");
```
We would have to write:
```cpp
var->Get<BatchReader>("batch_reader");
```
This requires that, to get a reader from a variable, we must know the reader's exact type every time, which is nearly impossible.
To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which hides the reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
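A minimal sketch of such a wrapper is shown below, assuming the holder owns the underlying reader through a `std::unique_ptr` (the exact ownership model is an implementation detail):
```cpp
class ReaderHolder {
 public:
  void Reset(ReaderBase* reader) { reader_.reset(reader); }
  ReaderBase* Get() const { return reader_.get(); }

  // Forward the ReaderBase interface, so a ReaderHolder can be used
  // wherever a reader is expected, regardless of the underlying type.
  void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
  bool HasNext() { return reader_->HasNext(); }
  void ReInit() { reader_->ReInit(); }

 private:
  std::unique_ptr<ReaderBase> reader_;
};
```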
## Related Operators
To create and invoke readers, some new ops are introduced:
### Operators That Create Readers
Each reader has its own creation op. File readers' creation ops have no inputs and yield the created file reader as their output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
However, direct usage of file readers' creation ops is not recommended because a file reader can only read one file via a single thread. Using `OpenFilesOp` is a better choice.
### OpenFilesOp
The `OpenFilesOp` is the creation op of `MultipleReader`. It takes no input but requires a list of file names as one of its attributes. The newly created `MultipleReader` then creates its own prefetching readers according to the given file names.
To make sure that the created prefetching readers match the file formats, we need a name-prefix rule to append file format tags to file names, as well as a file reader registry mechanism to map file format tags to the corresponding file readers' constructors.
### HasNextOp
`HasNextOp` is used to check whether the next data batch exists via the reader's `HasNext()` interface.
### ResetOp
`ResetOp` is used to reset a reader via its `ReInit()` interface.
### ReadOp
A reader is only a Variable; it cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader's `ReadNext()` function and gets a new batch of data (or only one instance of data, if we use a file reader directly). The output data of a reader is in the form of `std::vector<LoDTensor>`, so the `ReadOp` also needs to split the vector and move the LoDTensors to their respective output Variables.
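A hedged sketch of this run logic follows; the input/output names (`Reader`, `Out`) and the use of `ShareDataWith` are assumptions for illustration:
```cpp
void ReadOp::RunImpl(const framework::Scope& scope,
                     const platform::Place& place) const {
  auto* reader = scope.FindVar(Input("Reader"))->GetMutable<ReaderHolder>();
  std::vector<LoDTensor> ins;
  reader->ReadNext(&ins);  // fetch the next batch from the underlying reader

  const auto& out_names = Outputs("Out");
  PADDLE_ENFORCE_EQ(ins.size(), out_names.size());
  for (size_t i = 0; i < ins.size(); ++i) {
    // Scatter each LoDTensor into its corresponding output variable.
    auto* out = scope.FindVar(out_names[i])->GetMutable<LoDTensor>();
    out->ShareDataWith(ins[i]);
    out->set_lod(ins[i].lod());
  }
}
```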
## Program with Readers
A `Program` holds readers as its persistable variables. These variables are created by `CreateReaderOp` or `OpenFilesOp`. These ops shall run only once, so they shall be placed in the `startup_program`. `HasNextOp`, `ResetOp` and `ReadOp` are required by the training loop, so they shall be in the `main_program`.
The ops of a `startup_program` with readers would be like this:
```
multiple_reader = open_files_op(...)
batch_reader = create_batch_reader_op(multiple_reader)
double_buffer_reader = create_double_buffer_op(batch_reader)
... (other initializers)
```
The forwarding ops of the corresponding `main_program` would be like this:
```
not_completed = true
pass_count = 0
while_op(not_completed) {
has_next = has_next_op(double_buffer_reader)
if_else_op(has_next) {
batch_data = read_op(double_buffer_reader)
... (subsequent training ops)
} else {
reset_op(double_buffer_reader)
increase_op(pass_count)
not_completed = less_than_op(pass_count, required_pass_num)
}
}
```
A few important considerations for these programs are as follows:
1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables.
2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
3. All readers exist in both `startup_program` and `main_program`, and they are persistable.
### Simplify Configuration by MultiPassReader
The Program configuration mentioned above is complicated. Users need to be very familiar with the concepts of Program and Block to avoid making mistakes in their code. To make the usage of C++ readers friendlier to new users, we introduce `MultiPassReader`.
`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes (`pass_num`) and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches EOF, the multi-pass reader checks whether it has completed the given number of passes. If not, the underlying reader is re-initialized and a new pass starts automatically. Before the whole training completes, `MultiPassReader`'s `HasNext()` will always return `true`.
With `MultiPassReader`, the startup program would be like this:
```
multiple_reader = open_files_op(...)
batch_reader = create_batch_reader_op(multiple_reader)
multi_pass_reader = create_multi_pass_reader_op(batch_reader)
double_buffer_reader = create_double_buffer_op(multi_pass_reader)
... (other initializers)
```
The forwarding part of the corresponding `main_program` would be like this:
```
not_completed = true
while_op(not_completed) {
batch_data = read_op(double_buffer_reader)
... (subsequent training ops)
not_completed = has_next_op(double_buffer_reader)
}
```
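The pass-counting logic described above might look roughly like this sketch, where `pass_num_` and `pass_count_` are assumed member names:
```cpp
bool MultiPassReader::HasNext() {
  if (reader_->HasNext()) return true;
  ++pass_count_;
  if (pass_count_ < pass_num_) {
    // The underlying reader reached EOF but more passes remain:
    // re-initialize it and start a new pass automatically.
    reader_->ReInit();
    return reader_->HasNext();
  }
  return false;
}
```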
# Executor Design Doc
## Motivation
In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
The executor runs the `ProgramDesc` like an interpreter: the `ProgramDesc` contains the intrinsics (operators, in this case) and the variables to be used, and the executor explicitly executes this stored, precompiled description.
## Overview
An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
## Executor
The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
This is very similar to pushing a stack frame when entering a block; the executor then cleans up all the temporary variables when a mini-batch finishes. It does not, however, have a stack-frame pop process.
### The interface
```c++
Executor(places);
```
An executor does not own any computing resources; a user can only construct an executor using the specified places.
### Running an Executor
```c++
void Run(ProgramDesc, Scope, block_id, create_local_scope);
```
An `Executor` only provides a unified way to execute a `ProgramDesc`. The `ProgramDesc` is the target to be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block, and `create_local_scope` is a boolean that states whether the temporary variables will be destroyed after the execution finishes.
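A hedged usage sketch tying the two interfaces together is shown below; `startup_program`, `main_program`, and `num_batches` are assumed to be defined elsewhere:
```c++
Executor executor(places);  // the executor owns no computing resources
Scope scope;                // persistent across runs

// Run the startup block once to create and initialize the parameters.
executor.Run(startup_program, scope, /*block_id=*/0, /*create_local_scope=*/true);

// Run the main block once per mini-batch; temporaries live in a local scope.
for (int i = 0; i < num_batches; ++i) {
  executor.Run(main_program, scope, /*block_id=*/0, /*create_local_scope=*/true);
}
```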
Concurrent Programming
----------------------
.. toctree::
   :maxdepth: 1

   concurrent_programming.md
   parallel_do.md