Merge remote-tracking branch 'ups/develop' into nlp

1b8b253e · tensor-tang · 98fb8e58 · c509c825 · 1b8b253e · 1b8b253e
168 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,6 @@ option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FO
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
-option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
 option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
 option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
@@ -59,7 +58,6 @@ option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
 option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
-option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)

 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -100,6 +98,9 @@ endif()
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
  "A path setting third party libraries download & build directories.")

+set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
+  "A path setting fluid shared and static libraries")
+
 if (WITH_C_API AND WITH_PYTHON)
  message(WARNING "It is suggest not embedded a python interpreter in Paddle "
    "when using C-API. It will give an unpredictable behavior when using a "
@@ -153,7 +154,6 @@ include(cupti)
 include(configure)          # add paddle env configuration
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
-include(cpplint)            # set paddle c++ style
 include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
 include(rdma)               # set rdma libraries

--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -38,7 +38,7 @@ def str2bool(v):

 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
-    '--batch_size', type=int, default=128, help="Batch size for training.")
+    '--batch_size', type=int, default=16, help="Batch size for training.")
 parser.add_argument(
    '--learning_rate',
    type=float,
@@ -61,7 +61,7 @@ parser.add_argument(
 parser.add_argument(
    '--data_set',
    type=str,
-    default='cifar10',
+    default='flowers',
    choices=['cifar10', 'flowers'],
    help='Optional dataset for benchmark.')
 parser.add_argument(
@@ -200,26 +200,30 @@ def main():
                    fetch_list=[avg_cost, batch_acc, batch_size])
                return loss, acc, b_size

-            if args.profile and args.task_index == 0:
-                # warmup.
-                for batch_id, data in enumerate(train_reader()):
-                    if batch_id > 5: break
-                    run_step(batch_id, data)
-                with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
+            if args.profile:
+                with profiler.profiler('All', 'total',
+                                       '/tmp/profile_vgg_%d' % args.task_index):
                    for batch_id, data in enumerate(train_reader()):
                        if batch_id > 5: break
                        run_step(batch_id, data)

+            total_time = 0.0
+            count = 0
            for batch_id, data in enumerate(train_reader()):
                ts = time.time()
                loss, acc, b_size = run_step(batch_id, data)
                iters += 1
                num_samples += len(data)
                train_pass_acc.add(value=acc, weight=b_size)
+
+                duration = time.time() - ts
+                total_time += duration
+                count += len(data)
                print(
                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
-                    "Speed = %.2f img/s" % (pass_id, iters, loss, acc,
-                                            len(data) / (time.time() - ts))
+                    "Speed = %.2f (%.2f) img/s" % (pass_id, iters, loss, acc,
+                                                   len(data) / duration,
+                                                   count / total_time)
                )  # The accuracy is the accumulation of batches, but not the current batch.

            pass_elapsed = time.time() - start_time

--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
+# Fluid Benchmark
+
+This directory contains several models configurations and tools that used to run
+Fluid benchmarks for local and distributed training.
+
+
+## Run the Benchmark
+
+To start, run the following command to get the full help message:
+
+```bash
+python fluid_benchmark.py --help
+```
+
+Currently supported `--model` argument include:
+
+* mnist
+* resnet
+    * you can chose to use different dataset using `--data_set cifar10` or
+      `--data_set flowers`.
+* vgg
+* stacked_dynamic_lstm
+* machine_translation
+
+* Run the following command to start a benchmark job locally:
+    ```bash
+      python fluid_benchmark.py --model mnist --parallel 1 --device GPU --with_test
+    ```
+    You can choose to use GPU/CPU training. With GPU training, you can specify
+    `--parallel 1` to run multi GPU training.
+* Run distributed training with parameter servers:
+    * start parameter servers:
+        ```bash
+        PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
+        ```
+    * start trainers:
+        ```bash
+        PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
+        ```
+* Run distributed training using NCCL2
+    ```bash
+    PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3  PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method nccl2
+    ```
+
+## Run Distributed Benchmark on Kubernetes Cluster
+
+We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
+distributed benchmark jobs to your cluster. To generate a job yaml, just run:
+
+```bash
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver --with_test" --disttype pserver
+```
+
+Then the yaml files are generated under directory `myjob`, you can run:
+
+```bash
+kubectl create -f myjob/
+```
+
+The job shall start.
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import cProfile
+import time
+import os
+
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
+
+BENCHMARK_MODELS = [
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('Fluid model benchmarks.')
+    parser.add_argument(
+        '--model',
+        type=str,
+        choices=BENCHMARK_MODELS,
+        default='resnet',
+        help='The model to run benchmark with.')
+    parser.add_argument(
+        '--batch_size', type=int, default=32, help='The minibatch size.')
+    parser.add_argument(
+        '--learning_rate',
+        type=float,
+        default=0.001,
+        help='The minibatch size.')
+    # TODO(wuyi): add "--use_fake_data" option back.
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=100, help='The number of passes.')
+    parser.add_argument(
+        '--data_format',
+        type=str,
+        default='NCHW',
+        choices=['NCHW', 'NHWC'],
+        help='The data data_format, now only support NCHW.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--gpus',
+        type=int,
+        default=1,
+        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    parser.add_argument(
+        '--data_set',
+        type=str,
+        default='flowers',
+        choices=['cifar10', 'flowers'],
+        help='Optional dataset for benchmark.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--no_test',
+        action='store_false',
+        help='If set, test the testset during training.')
+    parser.add_argument(
+        '--memory_optimize',
+        action='store_true',
+        help='If set, optimize runtime memory before start.')
+    parser.add_argument(
+        '--update_method',
+        type=str,
+        default='local',
+        choices=['local', 'pserver', 'nccl2'],
+        help='Choose parameter update method, can be local, pserver, nccl2.')
+    args = parser.parse_args()
+    return args
+
+
+def append_nccl2_prepare():
+    if os.getenv("PADDLE_TRAINER_ID", None) != None:
+        # append gen_nccl_id at the end of startup program
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        port = os.getenv("PADDLE_PSERVER_PORT")
+        worker_ips = os.getenv("PADDLE_TRAINER_IPS")
+        worker_endpoints = []
+        for ip in worker_ips.split(","):
+            worker_endpoints.append(':'.join([ip, port]))
+        num_trainers = len(worker_endpoints)
+        current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
+        worker_endpoints.remove(current_endpoint)
+
+        nccl_id_var = fluid.default_startup_program().global_block().create_var(
+            name="NCCLID",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.RAW)
+        fluid.default_startup_program().global_block().append_op(
+            type="gen_nccl_id",
+            inputs={},
+            outputs={"NCCLID": nccl_id_var},
+            attrs={
+                "endpoint": current_endpoint,
+                "endpoint_list": worker_endpoints,
+                "trainer_id": trainer_id
+            })
+        return nccl_id_var, num_trainers, trainer_id
+    else:
+        raise Exception(
+            "must set PADDLE_TRAINER_ID env variables for dist train.")
+
+
+def dist_transpile():
+    if "PADDLE_TRAINING_ROLE" not in os.environ:
+        return None, None
+
+    # the port of all pservers, needed by both trainer and pserver
+    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+    # comma separated ips of all pservers, needed by trainer and
+    # pserver
+    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
+    eplist = []
+    for ip in pserver_ips.split(","):
+        eplist.append(':'.join([ip, port]))
+    pserver_endpoints = ",".join(eplist)
+    # total number of workers/trainers in the job, needed by
+    # trainer and pserver
+    trainers = int(os.getenv("PADDLE_TRAINERS"))
+    # the IP of the local machine, needed by pserver only
+    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
+    # the unique trainer id, starting from 0, needed by trainer
+    # only
+    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+    # the role, should be either PSERVER or TRAINER
+    training_role = os.getenv("PADDLE_TRAINING_ROLE")
+
+    t = distribute_transpiler.DistributeTranspiler()
+    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
+    if training_role == "PSERVER":
+        pserver_program = t.get_pserver_program(current_endpoint)
+        pserver_startup_program = t.get_startup_program(current_endpoint,
+                                                        pserver_program)
+        return pserver_program, pserver_startup_program
+    elif training_role == "TRAINER":
+        train_program = t.get_trainer_program()
+        return train_program, fluid.default_startup_program()
+    else:
+        raise ValueError(
+            'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
+        )
+
+
+def test(exe, inference_program, test_reader, feeder, batch_acc):
+    accuracy_evaluator = fluid.metrics.Accuracy()
+    for batch_id, data in enumerate(test_reader()):
+        acc = exe.run(inference_program,
+                      feed=feeder.feed(data),
+                      fetch_list=[batch_acc])
+        accuracy_evaluator.update(value=np.array(acc), weight=len(data))
+
+    return accuracy_evaluator.eval()
+
+
+# TODO(wuyi): replace train, train_parallel, test functions with new trainer
+# API once it is ready.
+def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
+          args, train_prog, startup_prog):
+    if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
+        place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(train_prog)
+        return
+
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(startup_prog)
+    feed_var_list = [
+        var for var in train_prog.global_block().vars.itervalues()
+        if var.is_data
+    ]
+    feeder = fluid.DataFeeder(feed_var_list, place)
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            loss = exe.run(train_prog,
+                           feed=feeder.feed(data),
+                           fetch_list=[avg_loss])
+            iters += 1
+            num_samples += len(data)
+            train_losses.append(loss)
+            print("Pass: %d, Iter: %d, Loss: %f\n" %
+                  (pass_id, iters, np.mean(train_losses)))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses)))
+        # evaluation
+        if not args.no_test and batch_acc != None:
+            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
+                                 batch_acc)
+            print(", Test Accuracy: %f" % pass_test_acc)
+        print("\n")
+        # TODO(wuyi): add warmup passes to get better perf data.
+        exit(0)
+
+
+# TODO(wuyi): replace train, train_parallel, test functions with new trainer
+# API once it is ready.
+def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
+                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
+                   num_trainers, trainer_id):
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    startup_exe = fluid.Executor(place)
+    startup_exe.run(startup_prog)
+    strategy = fluid.ExecutionStrategy()
+    strategy.num_threads = 1
+    strategy.allow_op_delay = False
+    exe = fluid.ParallelExecutor(
+        True,
+        avg_loss.name,
+        exec_strategy=strategy,
+        num_trainers=num_trainers,
+        trainer_id=trainer_id)
+    feed_var_list = [
+        var for var in train_prog.global_block().vars.itervalues()
+        if var.is_data
+    ]
+    feeder = fluid.DataFeeder(feed_var_list, place)
+    for pass_id in range(args.pass_num):
+        num_samples = 0
+        iters = 0
+        start_time = time.time()
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+            if args.update_method == "pserver":
+                exe.bcast_params()
+            num_samples += len(data)
+            iters += 1
+            if batch_id % 1 == 0:
+                print("Pass %d, batch %d, loss %s" %
+                      (pass_id, batch_id, np.array(loss)))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        if not args.no_test and batch_acc != None:
+            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
+                            batch_acc)
+            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
+        exit(0)
+
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- resnet Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def main():
+    args = parse_args()
+    print_arguments(args)
+    nccl_id_var, num_trainers, trainer_id = None, 1, 0
+
+    if args.use_cprof:
+        pr = cProfile.Profile()
+        pr.enable()
+    model_def = __import__("models.%s" % args.model, fromlist=["models"])
+    train_args = list(model_def.get_model(args))
+    train_args.append(args)
+    # Run optimizer.minimize(avg_loss)
+    train_args[2].minimize(train_args[0])
+    if args.memory_optimize:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    if args.update_method == "pserver":
+        train_prog, startup_prog = dist_transpile()
+        if not train_prog:
+            raise Exception(
+                "Must configure correct environments to run dist train.")
+        train_args.extend([train_prog, startup_prog])
+        if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
+            train_args.extend([nccl_id_var, num_trainers, trainer_id])
+            train_parallel(*train_args)
+        train(*train_args)
+        exit(0)
+
+    # for other update methods, use default programs
+    train_args.append(fluid.default_main_program())
+    train_args.append(fluid.default_startup_program())
+
+    if args.update_method == "nccl2":
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare()
+    if args.gpus == 1:
+        # NOTE: parallel executor use profiler interanlly
+        if args.use_nvprof and args.device == 'GPU':
+            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+                train(*train_args)
+        else:
+            train(*train_args)
+    else:
+        if args.device == "CPU":
+            raise Exception("Only support GPU perf with parallel exe")
+        train_args.extend([nccl_id_var, num_trainers, trainer_id])
+        train_parallel(*train_args)
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import yaml
+import copy
+import argparse
+import random
+import os
+from kube_templates import pserver, trainer, envs
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Generate dist job yamls.')
+
+    parser.add_argument(
+        '--jobname', default="paddlejob", help='unique job name')
+    parser.add_argument(
+        '--cpu', default=1, type=int, help='CPU cores per trainer node')
+    parser.add_argument(
+        '--pscpu', default=1, type=int, help='CPU cores per pserver node')
+    parser.add_argument(
+        '--gpu', default=0, type=int, help='num of GPUs per node')
+    parser.add_argument(
+        '--image',
+        default="bootstrapper:5000/fluid_benchmark:gpu",
+        help='num of GPUs per node')
+    parser.add_argument(
+        '--pservers', default=1, type=int, help='num of pservers')
+    parser.add_argument(
+        '--trainers', default=1, type=int, help='num of trainers')
+    parser.add_argument('--memory', default=1, type=int, help='trainer memory')
+    parser.add_argument(
+        '--psmemory', default=1, type=int, help='pserver memory')
+    parser.add_argument(
+        '--port', default=30236, type=int, help='num of trainers')
+    parser.add_argument(
+        '--entry', default="python train.py", help='command to run')
+    parser.add_argument(
+        '--fluid', default=1, type=int, help='whether is fluid job')
+    parser.add_argument(
+        '--rdma', action='store_ture', help='whether mount rdma libs')
+    parser.add_argument(
+        '--disttype',
+        default="pserver",
+        type=str,
+        choices=['pserver', 'nccl2', 'local'],
+        help='pserver or nccl2 or local')
+
+    args = parser.parse_args()
+    return args
+
+
+def gen_job():
+    ps = pserver
+    tn = trainer
+    args = parse_args()
+
+    ps_container = ps["spec"]["template"]["spec"]["containers"][0]
+    tn_container = tn["spec"]["template"]["spec"]["containers"][0]
+
+    if args.fluid == 1:
+        ps_container["command"] = \
+            ["paddle_k8s", "start_fluid"]
+        tn_container["command"] = \
+            ["paddle_k8s", "start_fluid"]
+    ps["metadata"]["name"] = args.jobname + "-pserver"
+    ps["spec"]["template"]["metadata"]["labels"][
+        "paddle-job-pserver"] = args.jobname
+    tn["metadata"]["name"] = args.jobname + "-trainer"
+    tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname
+
+    ps_container["image"] = args.image
+    tn_container["image"] = args.image
+
+    ps_container["resources"]["requests"]["cpu"] = str(args.pscpu)
+    ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi"
+    ps_container["resources"]["limits"]["cpu"] = str(args.pscpu)
+    ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi"
+
+    tn_container["resources"]["requests"]["cpu"] = str(args.cpu)
+    tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi"
+    tn_container["resources"]["limits"]["cpu"] = str(args.cpu)
+    tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi"
+    if args.gpu > 0:
+        tn_container["resources"]["requests"][
+            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+        tn_container["resources"]["limits"][
+            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+
+    ps["spec"]["replicas"] = int(args.pservers)
+    tn["spec"]["parallelism"] = int(args.trainers)
+    tn["spec"]["completions"] = int(args.trainers)
+    ps_container["ports"][0]["name"] = "jobport-" + str(args.port)
+    ps_container["ports"][0]["containerPort"] = args.port
+    spreadport = random.randint(40000, 60000)
+    tn_container["ports"][0]["name"] = "spr-" + str(spreadport)
+    tn_container["ports"][0]["containerPort"] = spreadport
+
+    envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
+    envs.append({"name": "TRAINERS", "value": str(args.trainers)})
+    envs.append({"name": "PSERVERS", "value": str(args.pservers)})
+    envs.append({"name": "ENTRY", "value": args.entry})
+    envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
+    # NOTE: these directories below are cluster specific, please modify
+    # this settings before you run on your own cluster.
+    envs.append({
+        "name": "LD_LIBRARY_PATH",
+        "value":
+        "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind"
+    })
+
+    volumes = [{
+        "name": "nvidia-driver",
+        "hostPath": {
+            "path": "/usr/local/nvidia/lib64"
+        }
+    }]
+    volumeMounts = [{
+        "mountPath": "/usr/local/nvidia/lib64",
+        "name": "nvidia-driver"
+    }]
+
+    if args.rdma:
+        volumes.extend([{
+            "name": "ibetc",
+            "hostPath": {
+                "path": "/etc/libibverbs.d"
+            }
+        }, {
+            "name": "iblibs",
+            "hostPath": {
+                "path": "/usr/local/rdma"
+            }
+        }, {
+            "name": "valgrind",
+            "hostPath": {
+                "path": "/usr/lib64/mlnx_ofed/valgrind"
+            }
+        }])
+        volumeMounts.extend([{
+            "mountPath": "/etc/libibverbs.d",
+            "name": "ibetc"
+        }, {
+            "mountPath": "/usr/local/rdma",
+            "name": "iblibs"
+        }, {
+            "mountPath": "/usr/lib64/mlnx_ofed/valgrind",
+            "name": "valgrind"
+        }])
+        # append shm for NCCL2
+        volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
+        volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})
+
+    tn["spec"]["template"]["spec"]["volumes"] = volumes
+    tn_container["volumeMounts"] = volumeMounts
+
+    ps_container["env"] = envs
+    ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"})
+    tn_container["env"] = envs
+    if args.disttype == "pserver":
+        tn_container["env"].append({
+            "name": "TRAINING_ROLE",
+            "value": "TRAINER"
+        })
+    elif args.disttype == "nccl2" or args.disttype == "local":
+        # NCCL2 have no training role, set to plain WORKER
+        tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"})
+
+    os.mkdir(args.jobname)
+    if args.disttype == "pserver":
+        with open("%s/pserver.yaml" % args.jobname, "w") as fn:
+            yaml.dump(ps, fn)
+
+    with open("%s/trainer.yaml" % args.jobname, "w") as fn:
+        yaml.dump(tn, fn)
+
+
+if __name__ == "__main__":
+    gen_job()
--- a/benchmark/fluid/kube_templates/__init__.py
+++ b/benchmark/fluid/kube_templates/__init__.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pserver import pserver
+from trainer import trainer
+
+__all__ = ["pserver", "trainer", "envs"]
+
+envs = [
+    # envs that don't need to change
+    {
+        "name": "GLOG_v",
+        "value": "0"
+    },
+    {
+        "name": "GLOG_logtostderr",
+        "value": "1"
+    },
+    {
+        "name": "TOPOLOGY",
+        "value": ""
+    },
+    {
+        "name": "TRAINER_PACKAGE",
+        "value": "/workspace"
+    },
+    {
+        "name": "PADDLE_INIT_NICS",
+        "value": "eth2"
+    },
+    {
+        "name": "NAMESPACE",
+        "valueFrom": {
+            "fieldRef": {
+                "fieldPath": "metadata.namespace"
+            }
+        }
+    },
+    {
+        "name": "POD_IP",
+        "valueFrom": {
+            "fieldRef": {
+                "fieldPath": "status.podIP"
+            }
+        }
+    }
+]
--- a/benchmark/fluid/kube_templates/pserver.py
+++ b/benchmark/fluid/kube_templates/pserver.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+pserver = {
+    "apiVersion": "extensions/v1beta1",
+    "kind": "ReplicaSet",
+    "metadata": {
+        "name": "jobname-pserver"
+    },
+    "spec": {
+        "replicas": 1,
+        "template": {
+            "metadata": {
+                "labels": {
+                    "paddle-job-pserver": "jobname"
+                }
+            },
+            "spec": {
+                "hostNetwork": True,
+                "imagePullSecrets": [{
+                    "name": "job-registry-secret"
+                }],
+                "containers": [{
+                    "name": "pserver",
+                    "image": "",
+                    "imagePullPolicy": "Always",
+                    "ports": [{
+                        "name": "jobport-1",
+                        "containerPort": 1
+                    }],
+                    "env": [],
+                    "command": ["paddle_k8s", "start_pserver"],
+                    "resources": {
+                        "requests": {
+                            "memory": "10Gi",
+                            "cpu": "4"
+                        },
+                        "limits": {
+                            "memory": "10Gi",
+                            "cpu": "4"
+                        }
+                    }
+                }]
+            }
+        }
+    }
+}
--- a/benchmark/fluid/kube_templates/trainer.py
+++ b/benchmark/fluid/kube_templates/trainer.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+trainer = {
+    "apiVersion": "batch/v1",
+    "kind": "Job",
+    "metadata": {
+        "name": "jobname-pserver"
+    },
+    "spec": {
+        "parallelism": 4,
+        "completions": 4,
+        "template": {
+            "metadata": {
+                "labels": {
+                    "paddle-job": "jobname"
+                }
+            },
+            "spec": {
+                "hostNetwork": True,
+                "imagePullSecrets": [{
+                    "name": "job-registry-secret"
+                }],
+                "restartPolicy": "Never",
+                "containers": [{
+                    "name": "trainer",
+                    "image": "",
+                    "imagePullPolicy": "Always",
+                    # to let container set rlimit
+                    "securityContext": {
+                        "privileged": True
+                        # TODO(wuyi): use below specific cap instead of privileged,
+                        # using privileged will cause all GPU device are visible
+                        # in the container.
+                        # "capabilities": {
+                        #     "add": ["SYS_RESOURCE"]
+                        # }
+                    },
+                    "ports": [{
+                        "name": "jobport-1",
+                        "containerPort": 1
+                    }],
+                    "env": [],
+                    "command": ["paddle_k8s", "start_trainer", "v2"],
+                    "resources": {
+                        "requests": {
+                            "memory": "10Gi",
+                            "cpu": "4",
+                        },
+                        "limits": {
+                            "memory": "10Gi",
+                            "cpu": "4",
+                        }
+                    }
+                }]
+            }
+        }
+    }
+}
--- a/benchmark/fluid/models/__init__.py
+++ b/benchmark/fluid/models/__init__.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
--- a/benchmark/fluid/machine_translation.py
+++ b/benchmark/fluid/machine_translation.py
@@ -27,74 +27,6 @@ import paddle.fluid.core as core
 import paddle.fluid.framework as framework
 from paddle.fluid.executor import Executor

-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--embedding_dim",
-    type=int,
-    default=512,
-    help="The dimension of embedding table. (default: %(default)d)")
-parser.add_argument(
-    "--encoder_size",
-    type=int,
-    default=512,
-    help="The size of encoder bi-rnn unit. (default: %(default)d)")
-parser.add_argument(
-    "--decoder_size",
-    type=int,
-    default=512,
-    help="The size of decoder rnn unit. (default: %(default)d)")
-parser.add_argument(
-    "--batch_size",
-    type=int,
-    default=16,
-    help="The sequence number of a mini-batch data. (default: %(default)d)")
-parser.add_argument(
-    '--skip_batch_num',
-    type=int,
-    default=5,
-    help='The first num of minibatch num to skip, for better performance test')
-parser.add_argument(
-    '--iterations', type=int, default=80, help='The number of minibatches.')
-parser.add_argument(
-    "--dict_size",
-    type=int,
-    default=30000,
-    help="The dictionary capacity. Dictionaries of source sequence and "
-    "target dictionary have same capacity. (default: %(default)d)")
-parser.add_argument(
-    "--pass_num",
-    type=int,
-    default=2,
-    help="The pass number to train. (default: %(default)d)")
-parser.add_argument(
-    "--learning_rate",
-    type=float,
-    default=0.0002,
-    help="Learning rate used to train the model. (default: %(default)f)")
-parser.add_argument(
-    "--infer_only", action='store_true', help="If set, run forward only.")
-parser.add_argument(
-    "--beam_size",
-    type=int,
-    default=3,
-    help="The width for beam searching. (default: %(default)d)")
-parser.add_argument(
-    '--device',
-    type=str,
-    default='GPU',
-    choices=['CPU', 'GPU'],
-    help="The device type.")
-parser.add_argument(
-    "--max_length",
-    type=int,
-    default=250,
-    help="The maximum length of sequence when doing generation. "
-    "(default: %(default)d)")
-parser.add_argument(
-    '--with_test',
-    action='store_true',
-    help='If set, test the testset during training.')
-

 def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
    def linear(inputs):
@@ -264,116 +196,37 @@ def lodtensor_to_ndarray(lod_tensor):
    return ndarray


-def train():
+def get_model(args):
+    embedding_dim = 512
+    encoder_size = 512
+    decoder_size = 512
+    dict_size = 30000
+    beam_size = 3
+    max_length = 250
    avg_cost, feeding_list = seq_to_seq_net(
-        args.embedding_dim,
-        args.encoder_size,
-        args.decoder_size,
-        args.dict_size,
-        args.dict_size,
+        embedding_dim,
+        encoder_size,
+        decoder_size,
+        dict_size,
+        dict_size,
        False,
-        beam_size=args.beam_size,
-        max_length=args.max_length)
+        beam_size=beam_size,
+        max_length=max_length)

    # clone from default main program
    inference_program = fluid.default_main_program().clone()

    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-    optimizer.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())

    train_batch_generator = paddle.batch(
        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=args.batch_size)

    test_batch_generator = paddle.batch(
        paddle.reader.shuffle(
-            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
        batch_size=args.batch_size)

-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = Executor(place)
-    exe.run(framework.default_startup_program())
-
-    def do_validation():
-        total_loss = 0.0
-        count = 0
-        for batch_id, data in enumerate(test_batch_generator()):
-            src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0]
-            trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0]
-            lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0]
-
-            fetch_outs = exe.run(inference_program,
-                                 feed={
-                                     feeding_list[0]: src_seq,
-                                     feeding_list[1]: trg_seq,
-                                     feeding_list[2]: lbl_seq
-                                 },
-                                 fetch_list=[avg_cost],
-                                 return_numpy=False)
-
-            total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
-            count += 1
-
-        return total_loss / count
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in xrange(args.pass_num):
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_batch_generator()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
-            num_samples += word_num
-            trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
-            num_samples += word_num
-            lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
-
-            fetch_outs = exe.run(framework.default_main_program(),
-                                 feed={
-                                     feeding_list[0]: src_seq,
-                                     feeding_list[1]: trg_seq,
-                                     feeding_list[2]: lbl_seq
-                                 },
-                                 fetch_list=[avg_cost])
-
-            iters += 1
-            loss = np.array(fetch_outs[0])
-            print(
-                "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
-            )  # The accuracy is the accumulation of batches, but not the current batch.
-
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        # evaluation
-        if args.with_test:
-            test_loss = do_validation()
-        exit(0)
-
-
-def infer():
-    pass
-
-
-def print_arguments(args):
-    print('----------- seq2seq Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    print_arguments(args)
-    if args.infer_only:
-        infer()
-    else:
-        train()
+    return avg_cost, inference_program, optimizer, train_batch_generator, \
+           test_batch_generator, None
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 import numpy as np
 import argparse
 import time
+import cProfile

 import paddle
 import paddle.fluid as fluid
@@ -31,42 +32,6 @@ DTYPE = "float32"
 # fluid.default_startup_program().random_seed = SEED


-def parse_args():
-    parser = argparse.ArgumentParser("mnist model benchmark.")
-    parser.add_argument(
-        '--batch_size', type=int, default=128, help='The minibatch size.')
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=5,
-        help='The first num of minibatch num to skip, for better performance test'
-    )
-    parser.add_argument(
-        '--iterations', type=int, default=35, help='The number of minibatches.')
-    parser.add_argument(
-        '--pass_num', type=int, default=5, help='The number of passes.')
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='GPU',
-        choices=['CPU', 'GPU'],
-        help='The device type.')
-    parser.add_argument(
-        '--infer_only', action='store_true', help='If set, run forward only.')
-    parser.add_argument(
-        '--use_cprof', action='store_true', help='If set, use cProfile.')
-    parser.add_argument(
-        '--use_nvprof',
-        action='store_true',
-        help='If set, use nvprof for CUDA.')
-    parser.add_argument(
-        '--with_test',
-        action='store_true',
-        help='If set, test the testset during training.')
-    args = parser.parse_args()
-    return args
-
-
 def cnn_model(data):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=data,
@@ -99,36 +64,13 @@ def cnn_model(data):
    return predict


-def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=args.batch_size)
-    test_pass_acc = fluid.average.WeightedAverage()
-    for batch_id, data in enumerate(test_reader()):
-        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
-                                data)).astype(DTYPE)
-        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-        y_data = y_data.reshape([len(y_data), 1])
-
-        acc, weight = exe.run(inference_program,
-                              feed={"pixel": img_data,
-                                    "label": y_data},
-                              fetch_list=[batch_acc, batch_size_tensor])
-        test_pass_acc.add(value=acc, weight=weight)
-        pass_acc = test_pass_acc.eval()
-    return pass_acc
-
-
-def run_benchmark(model, args):
-    if args.use_cprof:
-        pr = cProfile.Profile()
-        pr.enable()
-    start_time = time.time()
+def get_model(args):
    # Input data
    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
-    predict = model(images)
+    predict = cnn_model(images)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

@@ -143,86 +85,10 @@ def run_benchmark(model, args):
    # Optimization
    opt = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001, beta1=0.9, beta2=0.999)
-    opt.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())
-
-    # Initialize executor
-    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
-    exe = fluid.Executor(place)
-
-    # Parameter initialization
-    exe.run(fluid.default_startup_program())

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)
-
-    accuracy = fluid.metrics.Accuracy()
-    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        accuracy.reset()
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            img_data = np.array(
-                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([len(y_data), 1])
-
-            outs = train_exe.run(
-                feed={"pixel": img_data,
-                      "label": y_data},
-                fetch_list=[
-                    avg_cost.name, batch_acc.name, batch_size_tensor.name
-                ]
-            )  # The accuracy is the accumulation of batches, but not the current batch.
-            accuracy.update(
-                value=np.array(np.mean(outs[1])),
-                weight=np.mean(np.array(outs[2])))
-            iters += 1
-            num_samples += len(y_data)
-            loss = np.mean(np.array(outs[0]))
-            acc = np.mean(np.array(outs[1]))
-            train_losses.append(loss)
-            train_accs.append(acc)
-            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
-                  (pass_id, iters, loss, acc))
-
-        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        # evaluation
-        if args.with_test:
-            test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
-                                     inference_program)
-        exit(0)
-
-
-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- mnist Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    print_arguments(args)
-    if args.use_nvprof and args.device == 'GPU':
-        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
-            run_benchmark(cnn_model, args)
-    else:
-        run_benchmark(cnn_model, args)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)
+    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
--- a/benchmark/fluid/resnet.py
+++ b/benchmark/fluid/resnet.py
@@ -16,7 +16,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import argparse
 import functools
 import numpy as np
 import time
@@ -29,64 +28,6 @@ import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler


-def parse_args():
-    parser = argparse.ArgumentParser('Convolution model benchmark.')
-    parser.add_argument(
-        '--model',
-        type=str,
-        choices=['resnet_imagenet', 'resnet_cifar10'],
-        default='resnet_imagenet',
-        help='The model architecture.')
-    parser.add_argument(
-        '--batch_size', type=int, default=32, help='The minibatch size.')
-    parser.add_argument(
-        '--use_fake_data',
-        action='store_true',
-        help='use real data or fake data')
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=5,
-        help='The first num of minibatch num to skip, for better performance test'
-    )
-    parser.add_argument(
-        '--iterations', type=int, default=80, help='The number of minibatches.')
-    parser.add_argument(
-        '--pass_num', type=int, default=100, help='The number of passes.')
-    parser.add_argument(
-        '--data_format',
-        type=str,
-        default='NCHW',
-        choices=['NCHW', 'NHWC'],
-        help='The data data_format, now only support NCHW.')
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='GPU',
-        choices=['CPU', 'GPU'],
-        help='The device type.')
-    parser.add_argument(
-        '--data_set',
-        type=str,
-        default='flowers',
-        choices=['cifar10', 'flowers'],
-        help='Optional dataset for benchmark.')
-    parser.add_argument(
-        '--infer_only', action='store_true', help='If set, run forward only.')
-    parser.add_argument(
-        '--use_cprof', action='store_true', help='If set, use cProfile.')
-    parser.add_argument(
-        '--use_nvprof',
-        action='store_true',
-        help='If set, use nvprof for CUDA.')
-    parser.add_argument(
-        '--with_test',
-        action='store_true',
-        help='If set, test the testset during training.')
-    args = parser.parse_args()
-    return args
-
-
 def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
    conv1 = fluid.layers.conv2d(
        input=input,
@@ -100,7 +41,7 @@ def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):


 def shortcut(input, ch_out, stride):
-    ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1]
+    ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
    if ch_in != ch_out:
        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
    else:
@@ -172,23 +113,22 @@ def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
    return out


-def run_benchmark(model, args):
-    if args.use_cprof:
-        pr = cProfile.Profile()
-        pr.enable()
-
+def get_model(args):
+    model = resnet_cifar10
    if args.data_set == "cifar10":
        class_dim = 10
        if args.data_format == 'NCHW':
            dshape = [3, 32, 32]
        else:
            dshape = [32, 32, 3]
+        model = resnet_cifar10
    else:
        class_dim = 102
        if args.data_format == 'NCHW':
            dshape = [3, 224, 224]
        else:
            dshape = [224, 224, 3]
+        model = resnet_imagenet

    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
@@ -206,9 +146,6 @@ def run_benchmark(model, args):
            target_vars=[batch_acc, batch_size_tensor])

    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-    opts = optimizer.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())

    train_reader = paddle.batch(
        paddle.reader.shuffle(
@@ -221,97 +158,4 @@ def run_benchmark(model, args):
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)

-    def test(exe):
-        test_accuracy = fluid.average.WeightedAverage()
-        for batch_id, data in enumerate(test_reader()):
-            img_data = np.array(map(lambda x: x[0].reshape(dshape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            acc, weight = exe.run(inference_program,
-                                  feed={"data": img_data,
-                                        "label": y_data},
-                                  fetch_list=[batch_acc, batch_size_tensor])
-            test_accuracy.add(value=acc, weight=weight)
-
-        return test_accuracy.eval()
-
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-    accuracy = fluid.average.WeightedAverage()
-    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
-    if args.use_fake_data:
-        data = train_reader().next()
-        image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
-            'float32')
-        label = np.array(map(lambda x: x[1], data)).astype('int64')
-        label = label.reshape([-1, 1])
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        accuracy.reset()
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            if not args.use_fake_data:
-                image = np.array(map(lambda x: x[0].reshape(dshape),
-                                     data)).astype('float32')
-                label = np.array(map(lambda x: x[1], data)).astype('int64')
-                label = label.reshape([-1, 1])
-            loss, acc, weight = train_exe.run(
-                feed={'data': image,
-                      'label': label},
-                fetch_list=[
-                    avg_cost.name, batch_acc.name, batch_size_tensor.name
-                ])
-            iters += 1
-            num_samples += len(label)
-            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
-            loss = np.mean(np.array(loss))
-            acc = np.mean(np.array(acc))
-            train_losses.append(loss)
-            train_accs.append(acc)
-            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
-                  (pass_id, iters, loss, acc))
-        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        # evaluation
-        if args.with_test:
-            pass_test_acc = test(exe)
-        exit(0)
-
-
-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- resnet Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    model_map = {
-        'resnet_imagenet': resnet_imagenet,
-        'resnet_cifar10': resnet_cifar10
-    }
-    args = parse_args()
-    print_arguments(args)
-    if args.data_format == 'NHWC':
-        raise ValueError('Only support NCHW data_format now.')
-    if args.use_nvprof and args.device == 'GPU':
-        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
-            run_benchmark(model_map[args.model], args)
-    else:
-        run_benchmark(model_map[args.model], args)
+    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
--- a/benchmark/fluid/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@@ -29,57 +29,6 @@ import paddle.fluid as fluid
 import paddle.batch as batch
 import paddle.fluid.profiler as profiler

-
-def parse_args():
-    parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
-    parser.add_argument(
-        '--batch_size',
-        type=int,
-        default=32,
-        help='The sequence number of a batch data. (default: %(default)d)')
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=5,
-        help='The first num of minibatch num to skip, for better performance test'
-    )
-    parser.add_argument(
-        '--iterations', type=int, default=80, help='The number of minibatches.')
-    parser.add_argument(
-        '--emb_dim',
-        type=int,
-        default=512,
-        help='Dimension of embedding table. (default: %(default)d)')
-    parser.add_argument(
-        '--hidden_dim',
-        type=int,
-        default=512,
-        help='Hidden size of lstm unit. (default: %(default)d)')
-    parser.add_argument(
-        '--pass_num',
-        type=int,
-        default=100,
-        help='Epoch number to train. (default: %(default)d)')
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='CPU',
-        choices=['CPU', 'GPU'],
-        help='The device type.')
-    parser.add_argument(
-        '--crop_size',
-        type=int,
-        default=int(os.environ.get('CROP_SIZE', '1500')),
-        help='The max sentence length of input. Since this model use plain RNN,'
-        ' Gradient could be explored if sentence is too long')
-    parser.add_argument(
-        '--with_test',
-        action='store_true',
-        help='If set, test the testset during training.')
-    args = parser.parse_args()
-    return args
-
-
 word_dict = imdb.word_dict()


@@ -94,14 +43,15 @@ def crop_sentence(reader, crop_size):
    return __impl__


-def main():
-    args = parse_args()
-    lstm_size = args.hidden_dim
+def get_model(args):
+    lstm_size = 512
+    emb_dim = 512
+    crop_size = 1500

    data = fluid.layers.data(
        name="words", shape=[1], lod_level=1, dtype='int64')
    sentence = fluid.layers.embedding(
-        input=data, size=[len(word_dict), args.emb_dim])
+        input=data, size=[len(word_dict), emb_dim])

    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')

@@ -161,51 +111,17 @@ def main():
            target_vars=[batch_acc, batch_size_tensor])

    adam = fluid.optimizer.Adam()
-    adam.minimize(loss)
-
-    fluid.memory_optimize(fluid.default_main_program())
-
-    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())

    train_reader = batch(
        paddle.reader.shuffle(
-            crop_sentence(imdb.train(word_dict), args.crop_size),
-            buf_size=25000),
+            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
+        batch_size=args.batch_size)
+    test_reader = batch(
+        paddle.reader.shuffle(
+            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
        batch_size=args.batch_size)

-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            tensor_words = to_lodtensor([x[0] for x in data], place)
-            label = numpy.array([x[1] for x in data]).astype("int64")
-            label = label.reshape((-1, 1))
-            loss_np, acc, weight = exe.run(
-                fluid.default_main_program(),
-                feed={"words": tensor_words,
-                      "label": label},
-                fetch_list=[loss, batch_acc, batch_size_tensor])
-            iters += 1
-            for x in data:
-                num_samples += len(x[0])
-            print(
-                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
-                (pass_id, iters, loss_np, acc)
-            )  # The accuracy is the accumulation of batches, but not the current batch.
-
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        exit(0)
+    return loss, inference_program, adam, train_reader, test_reader, batch_acc


 def to_lodtensor(data, place):
@@ -221,16 +137,3 @@ def to_lodtensor(data, place):
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
-
-
-def print_arguments(args):
-    print('----------- lstm Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    print_arguments(args)
-    main()
--- a/benchmark/fluid/vgg.py
+++ b/benchmark/fluid/vgg.py
@@ -23,46 +23,6 @@ import paddle.fluid.core as core
 import argparse
 import functools

-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
-    '--skip_batch_num',
-    type=int,
-    default=5,
-    help='The first num of minibatch num to skip, for better performance test')
-parser.add_argument(
-    '--iterations', type=int, default=80, help='The number of minibatches.')
-parser.add_argument(
-    '--learning_rate',
-    type=float,
-    default=1e-3,
-    help="Learning rate for training.")
-parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
-parser.add_argument(
-    '--device',
-    type=str,
-    default='GPU',
-    choices=['CPU', 'GPU'],
-    help="The device type.")
-parser.add_argument(
-    '--data_format',
-    type=str,
-    default='NCHW',
-    choices=['NCHW', 'NHWC'],
-    help='The data order, now only support NCHW.')
-parser.add_argument(
-    '--data_set',
-    type=str,
-    default='cifar10',
-    choices=['cifar10', 'flowers'],
-    help='Optional dataset for benchmark.')
-parser.add_argument(
-    '--with_test',
-    action='store_true',
-    help='If set, test the testset during training.')
-args = parser.parse_args()
-

 def vgg16_bn_drop(input):
    def conv_block(input, num_filter, groups, dropouts):
@@ -91,7 +51,7 @@ def vgg16_bn_drop(input):
    return fc2


-def main():
+def get_model(args):
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
@@ -128,16 +88,6 @@ def main():

    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-    opts = optimizer.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())
-
-    # Initialize executor
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-
-    # Parameter initialization
-    exe.run(fluid.default_startup_program())

    # data reader
    train_reader = paddle.batch(
@@ -151,78 +101,4 @@ def main():
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)

-    # test
-    def test(exe):
-        test_accuracy = fluid.average.WeightedAverage()
-        for batch_id, data in enumerate(test_reader()):
-            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            acc, weight = exe.run(inference_program,
-                                  feed={"pixel": img_data,
-                                        "label": y_data},
-                                  fetch_list=[batch_acc, batch_size_tensor])
-            test_accuracy.add(value=acc, weight=weight)
-        return test_accuracy.eval()
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    accuracy = fluid.average.WeightedAverage()
-    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
-    for pass_id in range(args.pass_num):
-        accuracy.reset()
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            loss, acc, weight = train_exe.run(
-                feed={"pixel": img_data,
-                      "label": y_data},
-                fetch_list=[
-                    avg_cost.name, batch_acc.name, batch_size_tensor.name
-                ])
-            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
-            iters += 1
-            num_samples += len(y_data)
-            loss = np.mean(np.array(loss))
-            acc = np.mean(np.array(acc))
-            print(
-                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
-                (pass_id, iters, loss, acc)
-            )  # The accuracy is the accumulation of batches, but not the current batch.
-
-        # pass_train_acc = accuracy.eval()
-        train_losses.append(loss)
-        train_accs.append(acc)
-        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        # evaluation
-        if args.with_test:
-            pass_test_acc = test(exe)
-        exit(0)
-
-
-def print_arguments():
-    print('----------- vgg Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == "__main__":
-    print_arguments()
-    main()
+    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
-# util to check C++ file style
-# * it basically use google cpplint.py.
-# * It provide "add_style_check_target" for cmake.
-#   Usage see add_style_check_target's document
-#
-# TODO(yuyang18): Add python style check.
-
-set(STYLE_FILTER)
-
-# diable unwanted filters
-
-# paddle do not indent public/potected/private in class
-set(STYLE_FILTER "${STYLE_FILTER}-whitespace/indent,")
-# paddle use mutable reference. BUT IT IS NOT RECOMMANDED
-set(STYLE_FILTER "${STYLE_FILTER}-runtime/references,")
-# paddle use relative path for include.
-set(STYLE_FILTER "${STYLE_FILTER}-build/include,")
-# paddle use <thread>, <mutex>, etc.
-set(STYLE_FILTER "${STYLE_FILTER}-build/c++11,")
-# paddle use c style casting. BUT IT IS NOT RECOMMANDED
-set(STYLE_FILTER "${STYLE_FILTER}-readability/casting")
-
-
-# IGNORE SOME FILES
-set(IGNORE_PATTERN
-    .*ImportanceSampler.*
-    .*cblas\\.h.*
-    .*\\.pb\\.txt
-    .*MultiDataProvider.*
-    .*pb.*
-    .*pybind.h)
-
-# add_style_check_target
-#
-# attach check code style step for target.
-#
-# first argument: target name to attach
-# rest arguments: source list to check code style.
-#
-# NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing.
-macro(add_style_check_target TARGET_NAME)
-    if(WITH_STYLE_CHECK)
-        set(SOURCES_LIST ${ARGN})
-        list(REMOVE_DUPLICATES SOURCES_LIST)
-        foreach(filename ${SOURCES_LIST})
-            foreach(pattern ${IGNORE_PATTERN})
-                if(filename MATCHES ${pattern})
-                    list(REMOVE_ITEM SOURCES_LIST ${filename})
-                endif()
-            endforeach()
-        endforeach()
-
-        if(SOURCES_LIST)
-            add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-                COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py"
-                        "--filter=${STYLE_FILTER}"
-                        ${SOURCES_LIST}
-                COMMENT "cpplint: Checking source code style"
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})        
-        endif()
-    endif()
-endmacro()
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -23,17 +23,20 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
 SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+
+include(ProcessorCount)
+ProcessorCount(NUM_OF_PROCESSOR)
+
 IF(APPLE)
-  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
 ELSE()
-  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
 ENDIF()

 ExternalProject_Add(
    extern_grpc
    DEPENDS protobuf zlib
-    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.10.x"
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc.tar.xz"
    PREFIX          ${GRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -206,8 +206,6 @@ function(cc_library TARGET_NAME)
        list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
      endif()
    endforeach()
-    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
-
  else(cc_library_SRCS)
    if(cc_library_DEPS)
      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
@@ -231,7 +229,7 @@ endfunction(cc_binary)

 function(cc_test TARGET_NAME)
  if(WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS ARGS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -241,6 +239,9 @@ function(cc_test TARGET_NAME)
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if (${cc_test_SERIAL})
+        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    endif()
  endif()
 endfunction(cc_test)

@@ -268,7 +269,6 @@ function(nv_library TARGET_NAME)
          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
        endif()
      endforeach()
-      add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
    else(nv_library_SRCS)
      if (nv_library_DEPS)
        merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
@@ -295,7 +295,7 @@ endfunction(nv_binary)

 function(nv_test TARGET_NAME)
  if (WITH_GPU AND WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -303,6 +303,9 @@ function(nv_test TARGET_NAME)
    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
    add_test(${TARGET_NAME} ${TARGET_NAME})
+    if (nv_test_SERIAL)
+        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    endif()
  endif()
 endfunction(nv_test)

@@ -338,7 +341,6 @@ function(hip_library TARGET_NAME)
 	  list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
 	endif()
      endforeach()
-      add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
    else(hip_library_SRCS)
      if (hip_library_DEPS)
 	merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -52,32 +52,32 @@ function(copy TARGET)
 endfunction()

 # third party
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
 copy(eigen3_lib
  SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
  DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
 )

-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags")
 copy(gflags_lib
  SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
  DSTS ${dst_dir} ${dst_dir}/lib
 )

-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog")
 copy(glog_lib
  SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
  DSTS ${dst_dir} ${dst_dir}/lib
 )

-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/boost/")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/")
 copy(boost_lib
  SRCS ${BOOST_INCLUDE_DIR}/boost
  DSTS ${dst_dir}
 )

 if(NOT PROTOBUF_FOUND)
-    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
    copy(protobuf_lib
      SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
      DSTS ${dst_dir} ${dst_dir}/lib
@@ -85,13 +85,13 @@ if(NOT PROTOBUF_FOUND)
 endif()

 if(NOT CBLAS_FOUND)
-    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
    copy(openblas_lib
      SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
      DSTS ${dst_dir} ${dst_dir}
    )
 elseif (WITH_MKLML)
-    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
    copy(mklml_lib
      SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
      DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
@@ -99,7 +99,7 @@ elseif (WITH_MKLML)
 endif()

 if(WITH_MKLDNN)
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mkldnn")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
  copy(mkldnn_lib
    SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
    DSTS ${dst_dir} ${dst_dir}/lib
@@ -107,17 +107,17 @@ if(WITH_MKLDNN)
 endif()

 if(NOT MOBILE_INFERENCE AND NOT RPI)
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
  copy(snappy_lib
    SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
    DSTS ${dst_dir} ${dst_dir}/lib)

-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
  copy(snappystream_lib
    SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
    DSTS ${dst_dir} ${dst_dir}/lib)

-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
  copy(zlib_lib
    SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
    DSTS ${dst_dir} ${dst_dir}/lib)
@@ -125,7 +125,7 @@ endif()

 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
+set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
 copy(framework_lib DEPS framework_py_proto 
  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
@@ -165,15 +165,16 @@ copy(pybind_lib
 # CMakeCache Info
 copy(cmake_cache
  SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
-  DSTS ${CMAKE_INSTALL_PREFIX})
+  DSTS ${FLUID_INSTALL_DIR})

 add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 

 # paddle fluid version
 execute_process(
  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
  OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
-set(version_file ${CMAKE_INSTALL_PREFIX}/version.txt)
+set(version_file ${FLUID_INSTALL_DIR}/version.txt)
 file(WRITE ${version_file}
  "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
  "WITH_MKL: ${WITH_MKL}\n"

--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")

+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
 configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
    "${BINARY_BUILD_DIR_EN}/conf.py"
@@ -27,8 +30,6 @@ sphinx_add_target(paddle_fluid_docs
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_EN})

-add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")

@@ -50,6 +51,4 @@ sphinx_add_target(paddle_fluid_docs_cn
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_CN})

-add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)
-
 add_subdirectory(api)
--- a/doc/fluid/api/CMakeLists.txt
+++ b/doc/fluid/api/CMakeLists.txt
@@ -7,6 +7,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")

+set(IMPORT_PADDLE_STRING "import paddle")
+set(IMPORT_PADDLEV2_STRING "import paddle.v2")
+
 configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
    "${BINARY_BUILD_DIR_EN}/conf.py"

--- a/doc/fluid/api/clip.rst
+++ b/doc/fluid/api/clip.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+====
+clip
+====
+
+ErrorClipByValue
+----------------
+
+..  autoclass:: paddle.fluid.clip.ErrorClipByValue
+    :members:
+    :noindex:
+
+GradientClipByValue
+-------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByValue
+    :members:
+    :noindex:
+
+GradientClipByNorm
+------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByNorm
+    :members:
+    :noindex:
+
+GradientClipByGlobalNorm
+------------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm
+    :members:
+    :noindex:
+
+append_gradient_clip_ops
+------------------------
+
+..  autofunction:: paddle.fluid.clip.append_gradient_clip_ops
+    :noindex:
+
+error_clip_callback
+-------------------
+
+..  autofunction:: paddle.fluid.clip.error_clip_callback
+    :noindex:
+
--- a/doc/fluid/api/evaluator.rst
+++ b/doc/fluid/api/evaluator.rst
@@ -5,24 +5,3 @@
 evaluator
 =========

-ChunkEvaluator
--------------
-
-..  autoclass:: paddle.fluid.evaluator.ChunkEvaluator
-    :members:
-    :noindex:
-
-EditDistance
--------------
-
-..  autoclass:: paddle.fluid.evaluator.EditDistance
-    :members:
-    :noindex:
-
-DetectionMAP
--------------
-
-..  autoclass:: paddle.fluid.evaluator.DetectionMAP
-    :members:
-    :noindex:
-  
--- a/doc/fluid/api/executor.rst
+++ b/doc/fluid/api/executor.rst
@@ -30,3 +30,9 @@ switch_scope
 ..  autofunction:: paddle.fluid.executor.switch_scope
    :noindex:

+fetch_var
+---------
+
+..  autofunction:: paddle.fluid.executor.fetch_var
+    :noindex:
+
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
 #!/bin/bash
 python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst

-for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
+for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do
  python gen_doc.py ${module} > ${module}.rst
 done
--- a/doc/fluid/api/index_en.rst
+++ b/doc/fluid/api/index_en.rst
@@ -9,8 +9,9 @@ Fluid
    data_feeder.rst
    executor.rst
    initializer.rst
-    evaluator.rst
+    metrics.rst
    nets.rst
+    clip.rst
    optimizer.rst
    param_attr.rst
    profiler.rst

--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -33,11 +33,16 @@ Xavier
    :members:
    :noindex:

-MSRA
------
+force_init_on_cpu
+-----------------

-..  autoclass:: paddle.fluid.initializer.MSRA
-    :members:
+..  autofunction:: paddle.fluid.initializer.force_init_on_cpu
+    :noindex:
+
+init_on_cpu
+-----------
+
+..  autofunction:: paddle.fluid.initializer.init_on_cpu
    :noindex:

 ConstantInitializer
@@ -68,9 +73,3 @@ XavierInitializer
    :members:
    :noindex:

-
-MSRAInitializer
-----------------
-..  autoclass:: paddle.fluid.initializer.MSRAInitializer
-    :members:
-    :noindex:
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -55,6 +55,13 @@ While
    :members:
    :noindex:

+Switch
+------
+
+..  autoclass:: paddle.fluid.layers.Switch
+    :members:
+    :noindex:
+
 lod_rank_table
 --------------

@@ -67,12 +74,6 @@ max_sequence_len
 ..  autofunction:: paddle.fluid.layers.max_sequence_len
    :noindex:

-topk
----
-
-..  autofunction:: paddle.fluid.layers.topk
-    :noindex:
-
 lod_tensor_to_array
 -------------------

@@ -109,6 +110,12 @@ less_than
 ..  autofunction:: paddle.fluid.layers.less_than
    :noindex:

+equal
+-----
+
+..  autofunction:: paddle.fluid.layers.equal
+    :noindex:
+
 array_read
 ----------

@@ -212,6 +219,42 @@ Send
 ..  autofunction:: paddle.fluid.layers.Send
    :noindex:

+open_recordio_file
+------------------
+
+..  autofunction:: paddle.fluid.layers.open_recordio_file
+    :noindex:
+
+open_files
+----------
+
+..  autofunction:: paddle.fluid.layers.open_files
+    :noindex:
+
+read_file
+---------
+
+..  autofunction:: paddle.fluid.layers.read_file
+    :noindex:
+
+shuffle
+-------
+
+..  autofunction:: paddle.fluid.layers.shuffle
+    :noindex:
+
+batch
+-----
+
+..  autofunction:: paddle.fluid.layers.batch
+    :noindex:
+
+double_buffer
+-------------
+
+..  autofunction:: paddle.fluid.layers.double_buffer
+    :noindex:
+
 nn
 ==

@@ -281,12 +324,6 @@ square_error_cost
 ..  autofunction:: paddle.fluid.layers.square_error_cost
    :noindex:

-accuracy
--------
-
-..  autofunction:: paddle.fluid.layers.accuracy
-    :noindex:
-
 chunk_eval
 ----------

@@ -311,6 +348,18 @@ sequence_pool
 ..  autofunction:: paddle.fluid.layers.sequence_pool
    :noindex:

+sequence_softmax
+----------------
+
+..  autofunction:: paddle.fluid.layers.sequence_softmax
+    :noindex:
+
+softmax
+-------
+
+..  autofunction:: paddle.fluid.layers.softmax
+    :noindex:
+
 pool2d
 ------

@@ -323,12 +372,6 @@ batch_norm
 ..  autofunction:: paddle.fluid.layers.batch_norm
    :noindex:

-layer_norm
----------
-
-..  autofunction:: paddle.fluid.layers.layer_norm
-    :noindex:
-
 beam_search_decode
 ------------------

@@ -377,6 +420,12 @@ reduce_min
 ..  autofunction:: paddle.fluid.layers.reduce_min
    :noindex:

+reduce_prod
+-----------
+
+..  autofunction:: paddle.fluid.layers.reduce_prod
+    :noindex:
+
 sequence_first_step
 -------------------

@@ -425,6 +474,12 @@ matmul
 ..  autofunction:: paddle.fluid.layers.matmul
    :noindex:

+topk
+----
+
+..  autofunction:: paddle.fluid.layers.topk
+    :noindex:
+
 warpctc
 -------

@@ -473,6 +528,60 @@ multiplex
 ..  autofunction:: paddle.fluid.layers.multiplex
    :noindex:

+layer_norm
+----------
+
+..  autofunction:: paddle.fluid.layers.layer_norm
+    :noindex:
+
+softmax_with_cross_entropy
+--------------------------
+
+..  autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
+    :noindex:
+
+smooth_l1
+---------
+
+..  autofunction:: paddle.fluid.layers.smooth_l1
+    :noindex:
+
+one_hot
+-------
+
+..  autofunction:: paddle.fluid.layers.one_hot
+    :noindex:
+
+autoincreased_step_counter
+--------------------------
+
+..  autofunction:: paddle.fluid.layers.autoincreased_step_counter
+    :noindex:
+
+reshape
+-------
+
+..  autofunction:: paddle.fluid.layers.reshape
+    :noindex:
+
+lod_reset
+---------
+
+..  autofunction:: paddle.fluid.layers.lod_reset
+    :noindex:
+
+lrn
+---
+
+..  autofunction:: paddle.fluid.layers.lrn
+    :noindex:
+
+pad
+---
+
+..  autofunction:: paddle.fluid.layers.pad
+    :noindex:
+
 label_smooth
 ------------

@@ -480,12 +589,12 @@ label_smooth
    :noindex:

 roi_pool
---------
+--------

 ..  autofunction:: paddle.fluid.layers.roi_pool
    :noindex:

-    
+
 ops
 ===

@@ -501,18 +610,6 @@ mul
 ..  autofunction:: paddle.fluid.layers.mul
    :noindex:

-reshape
-------
-
-..  autofunction:: paddle.fluid.layers.reshape
-    :noindex:
-
-pad
---
-
-..  autofunction:: paddle.fluid.layers.pad
-    :noindex:
-
 scale
 -----

@@ -579,10 +676,70 @@ clip_by_norm
 ..  autofunction:: paddle.fluid.layers.clip_by_norm
    :noindex:

-sequence_softmax
----------------
+logical_and
+-----------

-..  autofunction:: paddle.fluid.layers.sequence_softmax
+..  autofunction:: paddle.fluid.layers.logical_and
+    :noindex:
+
+logical_or
+----------
+
+..  autofunction:: paddle.fluid.layers.logical_or
+    :noindex:
+
+logical_xor
+-----------
+
+..  autofunction:: paddle.fluid.layers.logical_xor
+    :noindex:
+
+logical_not
+-----------
+
+..  autofunction:: paddle.fluid.layers.logical_not
+    :noindex:
+
+uniform_random
+--------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random
+    :noindex:
+
+uniform_random_batch_size_like
+------------------------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
+    :noindex:
+
+gaussian_random
+---------------
+
+..  autofunction:: paddle.fluid.layers.gaussian_random
+    :noindex:
+
+gaussian_random_batch_size_like
+-------------------------------
+
+..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
+    :noindex:
+
+cumsum
+------
+
+..  autofunction:: paddle.fluid.layers.cumsum
+    :noindex:
+
+scatter
+-------
+
+..  autofunction:: paddle.fluid.layers.scatter
+    :noindex:
+
+sum
+---
+
+..  autofunction:: paddle.fluid.layers.sum
    :noindex:

 sigmoid
@@ -651,6 +808,18 @@ floor
 ..  autofunction:: paddle.fluid.layers.floor
    :noindex:

+cos
+---
+
+..  autofunction:: paddle.fluid.layers.cos
+    :noindex:
+
+sin
+---
+
+..  autofunction:: paddle.fluid.layers.sin
+    :noindex:
+
 round
 -----

@@ -828,4 +997,15 @@ topk
 ..  autofunction:: paddle.fluid.layers.topk
    :noindex:

+dice_loss
+----
+
+..  autofunction:: paddle.fluid.layers.dice_loss
+    :noindex:
+
+bilinear_interp
+____
+
+..  autofunction:: paddle.fluid.layers.bilinear_interp
+    :noindex:

--- a/doc/fluid/api/metrics.rst
+++ b/doc/fluid/api/metrics.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=======
+metrics
+=======
+
+MetricBase
+----------
+
+..  autoclass:: paddle.fluid.metrics.MetricBase
+    :members:
+    :noindex:
+
+CompositeMetric
+---------------
+
+..  autoclass:: paddle.fluid.metrics.CompositeMetric
+    :members:
+    :noindex:
+
+Accuracy
+--------
+
+..  autoclass:: paddle.fluid.metrics.Accuracy
+    :members:
+    :noindex:
+
+ChunkEvaluator
+--------------
+
+..  autoclass:: paddle.fluid.metrics.ChunkEvaluator
+    :members:
+    :noindex:
+
+EditDistance
+------------
+
+..  autoclass:: paddle.fluid.metrics.EditDistance
+    :members:
+    :noindex:
+
+DetectionMAP
+------------
+
+..  autoclass:: paddle.fluid.metrics.DetectionMAP
+    :members:
+    :noindex:
+
+Auc
+---
+
+..  autoclass:: paddle.fluid.metrics.Auc
+    :members:
+    :noindex:
+
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -47,6 +47,28 @@ DecayedAdagrad
    :members:
    :noindex:

+Adadelta
+-----------------
+
+..  autoclass:: paddle.fluid.optimizer.Adadelta
+    :members:
+    :noindex:
+
+RMSProp
+-----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSProp
+    :members:
+    :noindex:
+
+ModelAverage
+-----------------
+
+..  autoclass:: paddle.fluid.optimizer.ModelAverage
+    :members:
+    :noindex:
+
+
 SGDOptimizer
 ------------

@@ -89,9 +111,25 @@ DecayedAdagradOptimizer
    :members:
    :noindex:

-Adadelta
--------------
+
+AdadeltaOptimizer
+-----------------

 ..  autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
    :members:
    :noindex:
+
+
+RMSPropOptimizer
+-----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+    
+Optimizer
+---------
+
+..  autoclass:: paddle.fluid.optimizer.Optimizer
+    :members:
+    :noindex:
--- a/doc/fluid/api/regularizer.rst
+++ b/doc/fluid/api/regularizer.rst
@@ -11,6 +11,13 @@ append_regularization_ops
 ..  autofunction:: paddle.fluid.regularizer.append_regularization_ops
    :noindex:

+WeightDecayRegularizer
+----------------------
+
+..  autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer
+    :members:
+    :noindex:
+
 L1Decay
 -------

@@ -26,15 +33,16 @@ L2Decay
    :noindex:

 L1DecayRegularizer
---------------------
+------------------

 ..  autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
    :members:
    :noindex:

 L2DecayRegularizer
---------------------
+------------------

 ..  autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
    :members:
    :noindex:
+
--- a/doc/fluid/howto/index_cn.rst
+++ b/doc/fluid/howto/index_cn.rst
@@ -3,5 +3,6 @@

 .. toctree::
  :maxdepth: 1
-  
+
  optimization/index_cn.rst
+  inference/inference_support_in_fluid.md
--- a/doc/fluid/howto/index_en.rst
+++ b/doc/fluid/howto/index_en.rst
@@ -5,3 +5,4 @@ HOW TO
  :maxdepth: 1

  optimization/index_en.rst
+  inference/inference_support_in_fluid.md
--- a/doc/fluid/howto/inference/inference_support_in_fluid.md
+++ b/doc/fluid/howto/inference/inference_support_in_fluid.md
+# Fluid Inference使用指南
+
+## 目录：
+
+- Python Inference API
+- 编译Fluid Inference库
+- Inference C++ API
+- Inference实例
+- Inference计算优化
+
+## Python Inference API **[改进中]**
+- 保存Inference模型 ([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L295))
+
+  ```python
+  def save_inference_model(dirname,
+                           feeded_var_names,
+                           target_vars,
+                           executor,
+                           main_program=None,
+                           model_filename=None,
+                           params_filename=None):
+  ```
+  Inference模型和参数将会保存到`dirname`目录下：
+  - 序列化的模型
+    - `model_filename`为`None`，保存到`dirname/__model__`
+    - `model_filename`非`None`，保存到`dirname/model_filename`
+  - 参数
+    - `params_filename`为`None`，单独保存到各个独立的文件，各文件以参数变量的名字命名
+    - `params_filename`非`None`，保存到`dirname/params_filename`
+
+- 两种存储格式
+  - 参数保存到各个独立的文件
+    - 如，设置`model_filename`为`None`、`params_filename`为`None`
+
+    ```bash
+    $ cd recognize_digits_conv.inference.model
+    $ ls
+    $ __model__ batch_norm_1.w_0 batch_norm_1.w_2 conv2d_2.w_0 conv2d_3.w_0 fc_1.w_0 batch_norm_1.b_0 batch_norm_1.w_1 conv2d_2.b_0 conv2d_3.b_0 fc_1.b_0
+    ```
+  - 参数保存到同一个文件
+    - 如，设置`model_filename`为`None`、`params_filename`为`__params__`
+
+    ```bash
+    $ cd recognize_digits_conv.inference.model
+    $ ls
+    $ __model__ __params__
+    ```
+- 加载Inference模型([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L380))
+  ```python
+  def load_inference_model(dirname,
+                           executor,
+                           model_filename=None,
+                           params_filename=None):
+    ...
+    return [program, feed_target_names, fetch_targets]
+  ```
+
+
+## 编译Fluid Inference库
+
+  - **不需要额外的CMake选项**
+    - 1、 配置CMake命令，更多配置请参考[源码编译PaddlePaddle](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html)
+      ```bash
+      $ git clone https://github.com/PaddlePaddle/Paddle.git
+      $ cd Paddle
+      $ mkdir build
+      $ cd build
+      $ cmake -DCMAKE_INSTALL_PREFIX=your/path/to/paddle_inference_lib \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DWITH_PYTHON=ON \
+          -DWITH_MKL=OFF \
+          -DWITH_GPU=OFF \
+          ..
+      ```
+
+    - 2、 编译PaddlePaddle
+      ```bash
+      $ make
+      ```
+
+    - 3、 部署。执行如下命令将PaddlePaddle Fluid Inference库部署到`your/path/to/paddle_inference_lib`目录。
+      ```bash
+      $ make inference_lib_dist
+      ```
+
+- 目录结构
+
+  ```bash
+  $ cd your/path/to/paddle_inference_lib
+  $ tree
+  .
+  |-- paddle
+  |   `-- fluid
+  |       |-- framework
+  |       |-- inference
+  |       |   |-- io.h
+  |       |   `-- libpaddle_fluid.so
+  |       |-- memory
+  |       |-- platform
+  |       `-- string
+  |-- third_party
+  |   |-- eigen3
+  |   `-- install
+  |       |-- gflags
+  |       |-- glog
+  |       `-- protobuf
+  `-- ...
+  ```
+
+  假设`PADDLE_ROOT=your/path/to/paddle_inference_lib`。
+
+
+
+## 链接Fluid Inference库
+- 示例项目([链接](https://github.com/luotao1/fluid_inference_example.git))
+
+  - GCC配置
+    ```bash
+    $ g++ -o a.out -std=c++11 main.cc \
+          -I${PADDLE_ROOT}/ \
+          -I${PADDLE_ROOT}/third_party/install/gflags/include \
+          -I${PADDLE_ROOT}/third_party/install/glog/include \
+          -I${PADDLE_ROOT}/third_party/install/protobuf/include \
+          -I${PADDLE_ROOT}/third_party/eigen3 \
+          -L${PADDLE_ROOT}/paddle/fluid/inference -lpaddle_fluid \
+          -lrt -ldl -lpthread
+    ```
+
+  - CMake配置
+    ```cmake
+    include_directories(${PADDLE_ROOT}/)
+    include_directories(${PADDLE_ROOT}/third_party/install/gflags/include)
+    include_directories(${PADDLE_ROOT}/third_party/install/glog/include)
+    include_directories(${PADDLE_ROOT}/third_party/install/protobuf/include)
+    include_directories(${PADDLE_ROOT}/third_party/eigen3)
+    target_link_libraries(${TARGET_NAME}
+                          ${PADDLE_ROOT}/paddle/fluid/inference/libpaddle_fluid.so
+                          -lrt -ldl -lpthread)
+    ```
+
+  - 设置环境变量：
+  `export LD_LIBRARY_PATH=${PADDLE_ROOT}/paddle/fluid/inference:$LD_LIBRARY_PATH`
+
+
+
+## C++ Inference API
+
+- 推断流程([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_helper.h#L91))
+
+  - 1、 初始化设备
+    ```cpp
+    #include "paddle/fluid/framework/init.h"
+    paddle::framework::InitDevices(false);
+    ```
+
+  - 2、 定义place，executor，scope
+    ```cpp
+    auto place = paddle::platform::CPUPlace();
+    auto executor = paddle::framework::Executor(place);
+    auto* scope = new paddle::framework::Scope();
+    ```
+
+  - 3、 加载模型
+    ```cpp
+    #include "paddle/fluid/inference/io.h"
+    auto inference_program = paddle::inference::Load(executor, *scope, dirname);
+    // or
+    auto inference_program = paddle::inference::Load(executor,
+                                                     *scope,
+                                                     dirname + "/" + model_filename,
+                                                     dirname + "/" + params_filename);
+    ```
+
+  - 4、 获取`feed_target_names`和`fetch_target_names`
+    ```cpp
+    const std::vector<std::string>& feed_target_names = inference_program->GetFeedTargetNames();
+    const std::vector<std::string>& fetch_target_names = inference_program->GetFetchTargetNames();
+    ```
+
+  - 5、 准备`feed`数据
+    ```cpp
+    #include "paddle/fluid/framework/lod_tensor.h"
+    std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+    ...
+    std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+    for (size_t i = 0; i < feed_target_names.size(); ++i) {
+      // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+      feed_targets[feed_target_names[i]] = cpu_feeds[i];
+    }
+    ```
+
+  - 6、 定义`Tensor`来`fetch`结果
+    ```cpp
+    std::vector<paddle::framework::LoDTensor*> cpu_fetchs;
+    std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+    for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+      fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+    }
+    ```
+
+  - 7、 执行`inference_program`
+    ```cpp
+    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    ```
+
+  - 8、 使用`fetch`数据
+    ```cpp
+    for (size_t i = 0; i < cpu_fetchs.size(); ++i) {
+      std::cout << "lod_i: " << cpu_fetchs[i]->lod();
+      std::cout << "dims_i: " << cpu_fetchs[i]->dims();
+      std::cout << "result:";
+      float* output_ptr = cpu_fetchs[i]->data<float>();
+      for (int j = 0; j < cpu_fetchs[i]->numel(); ++j) {
+        std::cout << " " << output_ptr[j];
+      }
+      std::cout << std::endl;
+    }
+    ```
+    针对不同的数据，4. - 8.可执行多次。
+
+  - 9、 释放内存
+    ```cpp
+    delete scope;
+    ```
+
+
+- 接口说明
+
+  ```cpp
+  void Run(const ProgramDesc& program, Scope* scope,
+           std::map<std::string, const LoDTensor*>& feed_targets,
+           std::map<std::string, LoDTensor*>& fetch_targets,
+           bool create_vars = true,
+           const std::string& feed_holder_name = "feed",
+           const std::string& fetch_holder_name = "fetch");
+  ```
+  - 使用Python API `save_inference_model`保存的`program`里面包含了`feed_op`和`fetch_op`，用户提供的`feed_targets`、`fetch_targets`必须和`inference_program`中的`feed_op`、`fetch_op`保持一致。
+  - 用户提供的`feed_holder_name`和`fetch_holder_name`也必须和`inference_program`中`feed_op`、`fetch_op`保持一致，可使用`SetFeedHolderName`和`SetFetchHolderName`接口重新设置`inferece_program`
+  - 默认情况下，除了`persistable`属性设置为`True`的`Variable`之外，每次执行`executor.Run`会创建一个局部`Scope`，并且在这个局部`Scope`中创建和销毁所有的`Variable`，以最小化空闲时的内存占用。
+  - `persistable`属性为`True`的`Variable`有：
+    - Operators的参数`w`、`b`等
+    - `feed_op`的输入变量
+    - `fetch_op`的输出变量
+
+
+- **不在每次执行时创建和销毁变量
+ ([PR](https://github.com/PaddlePaddle/Paddle/pull/9301))**
+  - 执行`inference_program`
+    ```cpp
+    // Call once
+    executor.CreateVariables(*inference_program, scope, 0);
+    // Call as many times as you like
+    executor.Run(
+        *inference_program, scope, feed_targets, fetch_targets, false);
+    ```
+  - **优点**
+    - 节省了频繁创建、销毁变量的时间（约占每次`Run`总时间的1% ~ 12%）
+    - 执行结束后可获取所有Operators的计算结果
+  - **缺点**
+    - 空闲时也会占用大量的内存
+    - 在同一个`Scope`中，相同的变量名是公用同一块内存的，容易引起意想不到的错误
+
+
+- **不在每次执行时创建Op([PR](https://github.com/PaddlePaddle/Paddle/pull/9630))**
+  - 执行`inference_program`
+    ```cpp
+    // Call once
+    auto ctx = executor.Prepare(*inference_program, 0);
+    // Call as many times as you like if you have no need to change the inference_program
+    executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets);
+    ```
+  - **优点**
+    - 节省了频繁创建、销毁Op的时间
+  - **缺点**
+    - 一旦修改了`inference_program`，则需要重新创建`ctx`
+
+
+- **多线程共享Parameters([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_multi_thread_helper.h))**
+  - 主线程
+    - 1、 初始化设备
+    - 2、 定义`place`，`executor`，`scope`
+    - 3、 加载模型，得到`inference_program`
+  - 从线程
+    - **复制`inference_program`得到`copy_program`，修改`copy_program`的`feed_holder_name`和`fetch_holder_name`**
+      ```cpp
+      auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
+                 new paddle::framework::ProgramDesc(*inference_program));
+      std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id);
+      std::string fetch_holder_name = "fetch_" + paddle::string::to_string(thread_id);
+      copy_program->SetFeedHolderName(feed_holder_name);
+      copy_program->SetFetchHolderName(fetch_holder_name);
+      ```
+    - 4、 获取`copy_program`的`feed_target_names`和`fetch_target_names`
+    - 5、 准备feed数据，定义Tensor来fetch结果
+    - 6、 执行`copy_program`
+      ```cpp
+      executor->Run(*copy_program, scope, feed_targets, fetch_targets, true, feed_holder_name, fetch_holder_name);
+      ```
+    - 7、 使用fetch数据
+  - 主线程
+    - 8、 释放资源
+
+
+- 基本概念
+  - 数据相关：
+    - [Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor.md)，一个N维数组，数据可以是任意类型（int，float，double等）
+    - [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md)，带LoD(Level-of-Detail)即序列信息的Tensor
+    - [Scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)，记录了变量Variable
+  - 执行相关：
+    - [Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md)，无状态执行器，只跟设备相关
+    - Place
+      - CPUPlace，CPU设备
+      - CUDAPlace，CUDA GPU设备
+  - 神经网络表示：
+    - [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md).
+
+    详细介绍请参考[**Paddle Fluid开发者指南**](https://github.com/lcy-seso/learning_notes/blob/master/Fluid/developer's_guid_for_Fluid/Developer's_Guide_to_Paddle_Fluid.md)
+
+
+
+## Inference实例
+
+  1. fit a line: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc)
+  1. image classification: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_image_classification.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_image_classification.cc)
+  1. label semantic roles: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc)
+  1. recognize digits: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc)
+  1. recommender system: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recommender_system.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc)
+  1. understand sentiment: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_understand_sentiment.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc)
+  1. word2vec: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_word2vec.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_word2vec.cc)
+
+
+## Inference计算优化
+- 使用Python推理优化工具([inference_transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/inference_transpiler.py))
+  ```python
+  class InferenceTranspiler:
+    def transpile(self, program, place, scope=None):
+        ...
+        if scope is None:
+            scope = global_scope()
+        ...
+  ```
+  - 使用`InferenceTranspiler`将会直接修改`program`。
+  - 使用`InferenceTranspiler`会修改参数的值，请确保`program`的参数在`scope`内。
+- 支持的优化
+  - 融合batch_norm op的计算
+- 使用示例([链接](https://github.com/Xreki/Xreki.github.io/blob/master/fluid/inference/inference_transpiler.py))
+  ```python
+  import paddle.fluid as fluid
+  # NOTE: Applying the inference transpiler will change the inference_program.
+  t = fluid.InferenceTranspiler()
+  t.transpile(inference_program, place, inference_scope)
+  ```
+
+
+
+
+## 内存使用优化
+- 使用Python内存优化工具([memory_optimization_transipiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/memory_optimization_transpiler.py))
+  ```python
+  fluid.memory_optimize(inference_program)
+  ```
--- a/doc/mobile/CMakeLists.txt
+++ b/doc/mobile/CMakeLists.txt
@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")

+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
 configure_file(
        "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
        "${BINARY_BUILD_DIR_EN}/conf.py"
@@ -27,8 +30,6 @@ sphinx_add_target(paddle_mobile_docs
        ${CMAKE_CURRENT_SOURCE_DIR}
        ${SPHINX_HTML_DIR_EN})

-add_dependencies(paddle_mobile_docs gen_proto_py paddle_python)
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")

@@ -49,5 +50,3 @@ sphinx_add_target(paddle_mobile_docs_cn
        ${SPHINX_CACHE_DIR_CN}
        ${CMAKE_CURRENT_SOURCE_DIR}
        ${SPHINX_HTML_DIR_CN})
-
-add_dependencies(paddle_mobile_docs_cn gen_proto_py paddle_python)
--- a/doc/mobile/index_cn.rst
+++ b/doc/mobile/index_cn.rst
 移动端
-=====
+======

 ..  toctree::
  :maxdepth: 1

  cross_compiling_for_android_cn.md
  cross_compiling_for_ios_cn.md
-  cross_compiling_for_raspberry_cn.md
\ No newline at end of file
+  cross_compiling_for_raspberry_cn.md
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -16,8 +16,8 @@ import os, subprocess
 sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
-import paddle
-import paddle.v2
+@IMPORT_PADDLE_STRING@
+@IMPORT_PADDLEV2_STRING@

 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify

--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -16,8 +16,8 @@ import os, subprocess
 sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
-import paddle
-import paddle.v2
+@IMPORT_PADDLE_STRING@
+@IMPORT_PADDLEV2_STRING@


 MarkdownParser = parser.CommonMarkParser

--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")

+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
 configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
    "${BINARY_BUILD_DIR_EN}/conf.py"
@@ -27,8 +30,6 @@ sphinx_add_target(paddle_v2_docs
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_EN})

-add_dependencies(paddle_v2_docs gen_proto_py paddle_python)
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")

@@ -50,6 +51,4 @@ sphinx_add_target(paddle_v2_docs_cn
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_CN})

-add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)
-
 add_subdirectory(api)
--- a/doc/v2/api/CMakeLists.txt
+++ b/doc/v2/api/CMakeLists.txt
@@ -7,6 +7,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")

+set(IMPORT_PADDLE_STRING "import paddle")
+set(IMPORT_PADDLEV2_STRING "import paddle.v2")
+
 configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
    "${BINARY_BUILD_DIR_EN}/conf.py"

--- a/doc/v2/build_and_install/build_from_source_cn.rst
+++ b/doc/v2/build_and_install/build_from_source_cn.rst
@@ -19,8 +19,8 @@
 ----------------

 PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
-可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到，您也可以
-在 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`_ 找到 paddle_manylinux_devel
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`__ 找到，您也可以
+在 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__ 找到 paddle_manylinux_devel
 镜像的编译以及使用方法。或者参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。

 如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
@@ -35,7 +35,7 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
   # 2. 可选步骤：源码中构建用于编译PaddlePaddle的Docker镜像
   docker build -t paddle:dev .
   # 3. 执行下面的命令编译CPU-Only的二进制
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
   # 4. 或者也可以使用为上述可选步骤构建的镜像（必须先执行第2步）
   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev

@@ -116,11 +116,10 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安

  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行

-  ```emacs
-  (global-set-key "\C-cc" 'compile)
-  (setq compile-command
-   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-  ```
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")

  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。


--- a/doc/v2/build_and_install/build_from_source_en.rst
+++ b/doc/v2/build_and_install/build_from_source_en.rst
@@ -23,7 +23,7 @@ You need to use Docker to build PaddlePaddle
 to avoid installing dependencies by yourself. We have several pre-built
 Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ ,
 you can also find how to build and use paddle_manylinux_devel Docker image from
-`here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`_
+`here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__
 Or you can build your own image from source as the optional step below:

 .. code-block:: bash
@@ -34,7 +34,7 @@ Or you can build your own image from source as the optional step below:
   # 2. Optional: build development docker image from source
   docker build -t paddle:dev .
   # 3. Run the following command to build a CPU-Only binaries
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
   # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev

@@ -88,7 +88,7 @@ If you wish to run only one unit test, like :code:`test_sum_op`:
 .. _faq_docker:

 Frequently Asked Questions
----------------
+---------------------------

 - What is Docker?

@@ -118,11 +118,10 @@ Frequently Asked Questions

  Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configure file:

-  ```emacs
-  (global-set-key "\C-cc" 'compile)
-  (setq compile-command
-   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-  ```
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")

  so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.

@@ -145,7 +144,7 @@ Frequently Asked Questions
 .. _compile_deps:

 Appendix: Compile Dependencies
----------------
+-------------------------------

 PaddlePaddle need the following dependencies when compiling, other dependencies
 will be downloaded automatically.
@@ -166,11 +165,11 @@ will be downloaded automatically.
 .. _build_options:

 Appendix: Build Options
----------------
+-------------------------

 Build options include whether build binaries for CPU or GPU, which BLAS
 library to use etc. You may pass these settings when running cmake.
-For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_ 。
+For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`__ 。


 You can add :code:`-D` argument to pass such options, like:
@@ -219,7 +218,7 @@ keep on with latest cuDNN versions. Be sure to run with the same version of cuDN
 you built.

 Pass Compile Options
-++++++++++++++
++++++++++++++++++++++

 You can pass compile options to use intended BLAS/CUDA/Cudnn libraries.
 When running cmake command, it will search system paths like

--- a/doc/v2/build_and_install/docker_install_cn.rst
+++ b/doc/v2/build_and_install/docker_install_cn.rst
@@ -73,6 +73,7 @@
 当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：

  .. code-block:: bash
+
     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
     cd /work
     python train.py

--- a/doc/v2/build_and_install/docker_install_en.rst
+++ b/doc/v2/build_and_install/docker_install_en.rst
@@ -80,6 +80,7 @@ Also, you can go into the container shell, run or debug your code
 interactively:

  .. code-block:: bash
+
     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
     cd /work
     python train.py

--- a/doc/v2/build_and_install/index_cn.rst
+++ b/doc/v2/build_and_install/index_cn.rst
@@ -6,7 +6,7 @@
 PaddlePaddle针对不同的用户群体提供了多种安装方式。

 专注深度学习模型开发
-----------------
+--------------------

 PaddlePaddle提供了多种python wheel包，可通过pip一键安装：

@@ -18,7 +18,7 @@ PaddlePaddle提供了多种python wheel包，可通过pip一键安装：
 这是最便捷的安装方式，请根据机器配置和系统选择对应的安装包。

 关注底层框架
----------
+-------------

 PaddlePaddle提供了基于Docker的安装方式，请参照以下教程：

@@ -45,7 +45,7 @@ PaddlePaddle提供了基于Docker的安装方式，请参照以下教程：


 常见问题汇总
-----------
+--------------

 如果在安装过程中遇到了问题，请先尝试在下面的页面寻找答案：


--- a/doc/v2/build_and_install/index_en.rst
+++ b/doc/v2/build_and_install/index_en.rst
 install and Compile
-==========
+======================

 .. _install_steps:

 PaddlePaddle provides various methods of installation for many different users

 Focus on Deep Learning Model Development
-----------------
+----------------------------------------

 PaddlePaddle provides lots of packages of python wheel , that pip can install:

@@ -18,7 +18,7 @@ PaddlePaddle provides lots of packages of python wheel , that pip can install:
 This is the most convenient way of installation. Please choose the right installation package with machine configure and system.

 Follow the Bottom Frame
----------
+------------------------

 PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:


--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@@ -55,11 +55,11 @@ paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版
    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m"
    :widths: 1, 3, 3

-    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"

 .. _pip_dependency:


--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
@@ -58,11 +58,11 @@ If the links below shows up the login form, just click "Log in as guest" to star
    :header: "version", "cp27-cp27mu", "cp27-cp27m"
    :widths: 1, 3, 3

-    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"

 .. _pip_dependency:


--- a/doc/v2/howto/capi/workflow_of_capi_cn.md
+++ b/doc/v2/howto/capi/workflow_of_capi_cn.md
@@ -59,7 +59,7 @@
    代码示例如下：

    ```python
-    from paddle.utils.merge_model import merge_v2_modelss
+    from paddle.utils.merge_model import merge_v2_model
    from mnist_v2 import network

    net = network(is_infer=True)

--- a/go/pserver/client/c/test/CMakeLists.txt
+++ b/go/pserver/client/c/test/CMakeLists.txt
@@ -13,4 +13,3 @@
 # limitations under the License.
 #
 cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
-add_style_check_target(test_cclient test_cclient.c)
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -33,9 +33,6 @@ add_library(paddle_capi STATIC ${CAPI_HEADERS} ${CAPI_PRIVATE_HEADER}

 target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})

-add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
-  ${CAPI_PRIVATE_HEADER})
-
 add_dependencies(paddle_capi paddle_proto paddle_gserver)

 # TODO: paddle_capi_whole will be removed.

--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -87,8 +87,3 @@ else()
 endif()

 add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies})
-
-add_style_check_target(paddle_cuda
-                       ${CUDA_SOURCES}
-                       ${CUDA_HEADERS}
-                       ${CUDA_CXX_SOURCES})
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -36,5 +36,5 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha
        device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
        device_context gather_op_handle)
-cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
-        device_context reduce_op_handle )
+#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+#        device_context reduce_op_handle )
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
 #include "paddle/fluid/framework/details/send_op_handle.h"
+#include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/scope.h"

 #ifdef PADDLE_WITH_CUDA
@@ -159,25 +160,39 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
      if (!is_forwarding && places_.size() > 1) {
        // Currently, we assume that once gradient is generated, it can be
        // broadcast, and each gradient is only broadcast once.
-        for (auto &og : op->OutputArgumentNames()) {
-          if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
-            switch (strategy_.reduce_) {
-              case BuildStrategy::ReduceStrategy::kReduce:
-                CreateReduceOp(&result, og, cur_device_id);
-                var_name_on_devices[cur_device_id].emplace(og);
-                bcast_var_name_set[cur_device_id].emplace(
-                    og.substr(0, og.size() - strlen(kGradVarSuffix)));
-                cur_device_id = (cur_device_id + 1) % places_.size();
-                break;
-              case BuildStrategy::ReduceStrategy::kAllReduce:
-                if (IsSparseGradient(var_types, og)) {
-                  CreateReduceOp(&result, og, 0);
-                  CreateBroadcastOp(&result, og, 0);
-                } else {
-                  InsertNCCLAllReduceOp(&result, og);
-                }
-                break;
+        if (static_cast<bool>(boost::get<int>(op->GetAttr(
+                                  OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                              static_cast<int>(OpRole::kBackward))) {
+          try {
+            auto backward_vars =
+                boost::get<std::vector<std::string>>(op->GetNullableAttr(
+                    OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+
+            PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+
+            for (size_t i = 0; i < backward_vars.size(); i += 2) {
+              auto &p_name = backward_vars[i];
+              auto &g_name = backward_vars[i + 1];
+              VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
+
+              switch (strategy_.reduce_) {
+                case BuildStrategy::ReduceStrategy::kReduce:
+                  CreateReduceOp(&result, g_name, cur_device_id);
+                  var_name_on_devices[cur_device_id].emplace(g_name);
+                  bcast_var_name_set[cur_device_id].emplace(p_name);
+                  cur_device_id = (cur_device_id + 1) % places_.size();
+                  break;
+                case BuildStrategy::ReduceStrategy::kAllReduce:
+                  if (IsSparseGradient(var_types, g_name)) {
+                    CreateReduceOp(&result, g_name, 0);
+                    CreateBroadcastOp(&result, g_name, 0);
+                  } else {
+                    InsertNCCLAllReduceOp(&result, g_name);
+                  }
+                  break;
+              }
            }
+          } catch (boost::bad_get e) {
          }
        }
      }
@@ -398,11 +413,12 @@ void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
 }

 bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
-  // FIXME(yy): Do not hard code like this
-  return op.OutputArgumentNames().size() == 1 &&
-         op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
+  return boost::get<int>(
+             op.GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+             (static_cast<int>(OpRole::kBackward) |
+              static_cast<int>(OpRole::kLoss)) &&
+         !loss_var_name_.empty();  // If loss_var is empty. This is test mode
 }
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -96,10 +96,7 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
    info->proto_ = new proto::OpProto;
    info->checker_ = new OpAttrChecker();
    T maker;
-    maker.SetProto(info->proto_);
-    maker.SetChecker(info->checker_);
-    maker.Make();
-    maker.Validate();
+    maker(info->proto_, info->checker_);
    info->proto_->set_type(op_type);
    PADDLE_ENFORCE(
        info->proto_->IsInitialized(),

--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <unordered_map>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/shape_inference.h"
@@ -222,6 +223,15 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
  return it->second;
 }

+Attribute OpDesc::GetNullableAttr(const std::string &name) const {
+  auto it = attrs_.find(name);
+  if (it != attrs_.end()) {
+    return it->second;
+  } else {
+    return Attribute();
+  }
+}
+
 int OpDesc::GetBlockAttr(const std::string &name) const {
  auto it = attrs_.find(name);
  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
@@ -249,6 +259,13 @@ void OpDesc::RenameOutput(const std::string &old_name,
    std::replace(output.second.begin(), output.second.end(), old_name,
                 new_name);
  }
+
+  auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
+  if (it != attrs_.end()) {
+    auto &op_vars = boost::get<std::vector<std::string>>(it->second);
+    std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
+  }
+
  need_update_ = true;
 }


--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -78,6 +78,8 @@ class OpDesc {

  Attribute GetAttr(const std::string &name) const;

+  Attribute GetNullableAttr(const std::string &name) const;
+
  int GetBlockAttr(const std::string &name) const;

  void Rename(const std::string &old_name, const std::string &new_name);

--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -13,6 +13,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/op_proto_maker.h"
 #include <string>
+#include <vector>

 namespace paddle {
 namespace framework {
@@ -55,5 +56,28 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
  }
 }

+void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
+                                        OpAttrChecker* attr_checker) {
+  proto_ = proto;
+  op_checker_ = attr_checker;
+  Make();
+
+  AddAttr<int>(OpRoleAttrName(), "The role of this operator")
+      .InEnum(
+          {static_cast<int>(OpRole::kForward),
+           static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kOptimize),
+           static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
+           static_cast<int>(OpRole::kLoss) |
+               static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kNotSpecified)})
+      .SetDefault(static_cast<int>(OpRole::kNotSpecified));
+  AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),
+                                    "Optimized for variable")
+      .SetDefault({});
+
+  Validate();
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -20,21 +20,31 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+enum class OpRole {
+  kForward = 0x0000,
+  kBackward = 0x0001,
+  kOptimize = 0x0002,
+
+  kLoss = 0x0100,
+  // The default value of op's role. This should be only used for unittests and
+  // CreateOp inside a operator.
+  kNotSpecified = 0x1000,
+};
+
 // this class not only make proto but also init attribute checkers.
 class OpProtoAndCheckerMaker {
 public:
+  static const char *OpRoleAttrName() { return "op_role"; }
+  static const char *OpRoleVarAttrName() { return "op_role_var"; }
+
+  void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
+
  virtual void Make() = 0;

  virtual ~OpProtoAndCheckerMaker() {
    CHECK(validated_) << "should call Validate after build";
  }

-  void SetProto(proto::OpProto *proto) { proto_ = proto; }
-
-  void SetChecker(OpAttrChecker *attr_checker) { op_checker_ = attr_checker; }
-
-  void Validate();
-
 protected:
  struct VariableBuilder {
    proto::OpProto::Var *var_;
@@ -76,6 +86,7 @@ class OpProtoAndCheckerMaker {

 private:
  void CheckNoDuplicatedInOutAttrs();
+  void Validate();

  proto::OpProto *proto_;
  OpAttrChecker *op_checker_;

--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -28,10 +28,8 @@ TEST(ProtoMaker, DuplicatedAttr) {
  paddle::framework::proto::OpProto op_proto;
  paddle::framework::OpAttrChecker op_checker;
  TestAttrProtoMaker proto_maker;
-  proto_maker.SetProto(&op_proto);
-  proto_maker.SetChecker(&op_checker);
-  proto_maker.Make();
-  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+               paddle::platform::EnforceNotMet);
 }

 class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
@@ -46,8 +44,6 @@ TEST(ProtoMaker, DuplicatedInOut) {
  paddle::framework::proto::OpProto op_proto;
  paddle::framework::OpAttrChecker op_checker;
  TestAttrProtoMaker proto_maker;
-  proto_maker.SetProto(&op_proto);
-  proto_maker.SetChecker(&op_checker);
-  proto_maker.Make();
-  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+               paddle::platform::EnforceNotMet);
 }
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
-cc_library(analysis SRCS dot.cc node.cc node.h)
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
+cc_library(analysis SRCS dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc fluid_to_data_flow_graph_pass.cc
+  DEPS paddle_fluid)
 cc_test(test_node SRCS node_tester.cc DEPS analysis)
+cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
+
+set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+
+cc_test(test_data_flow_graph SRCS data_flow_graph_tester.cc DEPS analysis ${FLUID_CORE_MODULES} paddle_fluid
+  ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
+set_tests_properties(test_data_flow_graph PROPERTIES DEPENDS test_word2vec)
+
+cc_test(test_subgraph_splitter
+        SRCS subgraph_splitter_tester.cc
+        DEPS analysis paddle_fluid tensor
+        ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
+set_tests_properties(test_subgraph_splitter PROPERTIES DEPENDS test_word2vec)
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/dot.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+// It is a better idea that the inputs and outputs of this graph is set manully
+// before, but there must be a Pass that helps to prune the unnecessary ops that
+// do not contribute to the given targets, so in this pass, analysis and get the
+// inputs and outputs is OK.
+void DataFlowGraph::Build() {
+  inputs.clear();
+  outputs.clear();
+  std::unordered_set<Node *> ins;
+  std::unordered_set<Node *> outs;
+  for (auto &node : nodes.nodes()) {
+    for (auto *in : node->inlinks) {
+      ins.insert(in);
+    }
+    for (auto *out : node->outlinks) {
+      outs.insert(out);
+    }
+  }
+
+  // The nodes that in ins but not in outs is the graph's inputs
+  // similarly, the nodes that in outs but not in ins is the graphs' outputs
+  for (auto *in : ins) {
+    if (!outs.count(in)) {
+      inputs.push_back(in);
+    }
+  }
+  for (auto *out : outs) {
+    if (!outs.count(out)) {
+      outputs.push_back(out);
+    }
+  }
+}
+
+std::string DataFlowGraph::DotString() const {
+  Dot dot;
+
+  // Add nodes
+  for (size_t i = 0; i < nodes.size(); i++) {
+    const Node &node = nodes.Get(i);
+    switch (node.type()) {
+      case Node::Type::kValue:
+        dot.AddNode(node.repr(), node.dot_attrs());
+        break;
+      case Node::Type::kFunction:
+        dot.AddNode(node.repr(), node.dot_attrs());
+        break;
+      case Node::Type::kFunctionBlock:
+        dot.AddNode(node.repr(), node.dot_attrs());
+        break;
+      default:
+        PADDLE_THROW("unsupported Node type %d", static_cast<int>(node.type()));
+    }
+  }
+
+  // Add edges
+  for (size_t i = 0; i < nodes.size(); i++) {
+    const Node &node = nodes.Get(i);
+    for (auto &in : node.inlinks) {
+      dot.AddEdge(in->repr(), node.repr(), {});
+    }
+  }
+  return dot.Build();
+}
+
+//
+// NodesBFSIterator
+//
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+    const std::vector<Node *> &source)
+    : queue_(source.begin(), source.end()) {}
+
+// GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+//     GraphTraits<DataFlowGraph>::NodesBFSIterator &&other) noexcept
+//     : queue_(std::move(other.queue_)),
+//       visited_(std::move(other.visited_)) {}
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other)
+    : queue_(other.queue_), visited_(other.visited_) {}
+
+Node &GraphTraits<DataFlowGraph>::NodesBFSIterator::operator*() {
+  PADDLE_ENFORCE(!queue_.empty());
+  return *queue_.front();
+}
+
+Node *GraphTraits<DataFlowGraph>::NodesBFSIterator::operator->() {
+  PADDLE_ENFORCE(!queue_.empty());
+  return queue_.front();
+}
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator &
+GraphTraits<DataFlowGraph>::NodesBFSIterator::operator=(
+    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other) {
+  queue_ = other.queue_;
+  visited_ = other.visited_;
+  return *this;
+}
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator
+    &GraphTraits<DataFlowGraph>::NodesBFSIterator::operator++() {
+  PADDLE_ENFORCE(!queue_.empty());
+  auto *cur = queue_.front();
+  visited_.insert(cur);
+  queue_.pop_front();
+  for (auto *output : cur->outlinks) {
+    if (!visited_.count(output)) {
+      queue_.push_back(output);
+      visited_.insert(output);
+    }
+  }
+  return *this;
+}
+
+bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
+    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other) {
+  if (queue_.empty()) return other.queue_.empty();
+  if ((!queue_.empty()) && (!other.queue_.empty())) {
+    return queue_.front() == other.queue_.front() &&
+           visited_.size() == other.visited_.size();  // here need to check the
+                                                      // equality of queue and
+    // visited. Just a light but week implementation.
+  }
+  return false;
+}
+
+//
+// NodesDFSIterator
+//
+GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+    const std::vector<Node *> &source) {
+  for (auto *x : source) stack_.push(x);
+}
+
+// GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+//     GraphTraits<DataFlowGraph>::NodesDFSIterator &&other) noexcept
+//     : stack_(std::move(other.stack_)),
+//       visited_(std::move(other.visited_)) {}
+
+GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other)
+    : stack_(other.stack_), visited_(other.visited_) {}
+
+Node &GraphTraits<DataFlowGraph>::NodesDFSIterator::operator*() {
+  PADDLE_ENFORCE(!stack_.empty());
+  return *stack_.top();
+}
+
+GraphTraits<DataFlowGraph>::NodesDFSIterator
+    &GraphTraits<DataFlowGraph>::NodesDFSIterator::operator++() {
+  if (stack_.empty()) return *this;
+  visited_.insert(stack_.top());
+  auto *cur = stack_.top();
+  stack_.pop();
+  for (auto *x : cur->outlinks) {
+    if (!visited_.count(x)) {
+      stack_.push(x);
+      visited_.insert(x);
+    }
+  }
+  return *this;
+}
+bool GraphTraits<DataFlowGraph>::NodesDFSIterator::operator==(
+    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other) {
+  if (stack_.empty()) return other.stack_.empty();
+  if ((!stack_.empty()) && (!other.stack_.empty())) {
+    return stack_.top() == other.stack_.top();
+  }
+  return false;
+}
+
+GraphTraits<DataFlowGraph>::NodesDFSIterator &
+GraphTraits<DataFlowGraph>::NodesDFSIterator::operator=(
+    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other) {
+  stack_ = other.stack_;
+  visited_ = other.visited_;
+  return *this;
+}
+Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
+  return stack_.top();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * Data flow graph is an pass that build the basic graph. It contains a graph
+ * and the iterators that enable the iteration over the graph.
+ */
+
+#pragma once
+
+#include <deque>
+#include <stack>
+#include <unordered_set>
+
+#include "paddle/fluid/inference/analysis/graph_traits.h"
+#include "paddle/fluid/inference/analysis/node.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * DataFlowGraph - A container of Value and Function Nodes.
+ */
+struct DataFlowGraph {
+  NodeMap nodes;
+  std::vector<Node *> inputs;
+  std::vector<Node *> outputs;
+
+  // Extract inputs and outputs of the graph.
+  void Build();
+
+  // Output a DOT graph file for debug.
+  std::string DotString() const;
+};
+
+/*
+ * An graph trait help to traverse the graph using BFS.
+ * The BFS start from a graph's inputs, the graph should be fully-connected, so
+ * that the iterator can reach the end.
+ */
+template <>
+struct GraphTraits<DataFlowGraph> {
+  // BFS iterator on nodes.
+  struct NodesBFSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesBFSIterator() = default;
+    explicit NodesBFSIterator(const std::vector<Node *> &source);
+    // NodesBFSIterator(NodesBFSIterator &&other) noexcept;
+    // NOTE Heavy to use.
+    NodesBFSIterator(const NodesBFSIterator &other);
+
+    Node &operator*();
+    NodesBFSIterator &operator++();
+    Node *operator->();
+    // TODO(Superjomn) current implementation just compare the first
+    // element, need to compare the graph and all the elements in the queue and
+    // set.
+    NodesBFSIterator &operator=(const NodesBFSIterator &other);
+    bool operator==(const NodesBFSIterator &other);
+    bool operator!=(const NodesBFSIterator &other) { return !(*this == other); }
+
+   private:
+    std::deque<Node *> queue_;
+    std::unordered_set<Node *> visited_;
+  };
+
+  // DFS iterator on nodes.
+  struct NodesDFSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesDFSIterator() = default;
+    explicit NodesDFSIterator(const std::vector<Node *> &source);
+    // NodesDFSIterator(NodesDFSIterator &&other) noexcept;
+    NodesDFSIterator(const NodesDFSIterator &other);
+
+    Node &operator*();
+    NodesDFSIterator &operator++();
+    // TODO(Superjomn) current implementation just compare the first
+    // element, need to compare the graph and all the elements in the queue and
+    // set.
+    NodesDFSIterator &operator=(const NodesDFSIterator &other);
+    bool operator==(const NodesDFSIterator &other);
+    bool operator!=(const NodesDFSIterator &other) { return !(*this == other); }
+    Node *operator->();
+
+   private:
+    std::stack<Node *> stack_;
+    std::unordered_set<Node *> visited_;
+  };
+
+  explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
+
+  // default use BFS to visit the nodes.
+  iterator_range<NodesBFSIterator> nodes() {
+    return iterator_range<NodesBFSIterator>(nodes_bfs_begin(), nodes_bfs_end());
+  }
+  iterator_range<NodesBFSIterator> nodes_in_BFS() {
+    return iterator_range<NodesBFSIterator>(nodes_bfs_begin(), nodes_bfs_end());
+  }
+  iterator_range<NodesDFSIterator> nodes_in_DFS() {
+    return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
+  }
+
+ private:
+  NodesBFSIterator nodes_bfs_begin() {
+    return NodesBFSIterator(graph_->inputs);
+  }
+  NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
+  NodesDFSIterator nodes_dfs_begin() {
+    return NodesDFSIterator(graph_->inputs);
+  }
+  NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
+
+ private:
+  DataFlowGraph *graph_;
+};
+
+// Extract the inputs and outputs of a graph. The inputs and outputs of a
+// sub-graph is the inputs nodes and output nodes that doesn't inside the
+// sub-graph.
+std::pair<
+    std::vector<Node *>,
+    std::vector<
+        Node *>> static ExtractInputAndOutputOfSubGraph(std::vector<Node *>
+                                                            &graph) {
+  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
+  std::unordered_set<Node *> inputs;
+  std::unordered_set<Node *> outputs;
+  for (auto &node : graph) {
+    for (auto *in : node->inlinks) {
+      if (!nodes.count(in) && in->type() == Node::Type::kValue) {
+        inputs.insert(in);
+      }
+    }
+    for (auto *out : node->outlinks) {
+      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
+        outputs.insert(out);
+      }
+    }
+  }
+  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
+                        std::vector<Node *>(outputs.begin(), outputs.end()));
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST(DataFlowGraph, BFS) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+  dfg.Build();
+
+  for (auto* in : dfg.inputs) {
+    LOG(INFO) << "inputs: " << in->name() << " "
+              << static_cast<int>(in->type());
+  }
+  for (auto* out : dfg.outputs) {
+    LOG(INFO) << "outputs: " << out->name() << " "
+              << static_cast<int>(out->type());
+  }
+
+  GraphTraits<DataFlowGraph> trait(&dfg);
+  auto nodes = trait.nodes();
+  int count = 0;
+  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
+    LOG(INFO) << "visiting " << it->name();
+    ++count;
+  }
+  ASSERT_EQ(count, dfg.nodes.size());
+}
+
+TEST(DataFlowGraph, DFS) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+  dfg.Build();
+  GraphTraits<DataFlowGraph> trait(&dfg);
+  auto nodes = trait.nodes_in_DFS();
+  int count = 0;
+  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
+    LOG(INFO) << "visiting " << it->name();
+    ++count;
+  }
+  ASSERT_EQ(count, dfg.nodes.size());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+
+#include <glog/logging.h>
+#include <google/protobuf/text_format.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/io.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, Test) {
+  framework::proto::ProgramDesc new_desc;
+  DataFlowGraph graph;
+
+  FluidToDataFlowGraphPass pass0;
+  DataFlowGraphToFluidPass pass1;
+  pass0.Initialize(desc);
+  pass1.Initialize(&new_desc);
+
+  pass0.Run(&graph);
+  pass1.Run(&graph);
+
+  pass0.Finalize();
+  pass1.Finalize();
+
+  LOG(INFO) << graph.nodes.size();
+}
+
+}  // analysis
+}  // inference
+}  // paddle
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include <vector>
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+FluidToDataFlowGraphPass::FluidToDataFlowGraphPass() {}
+
+bool FluidToDataFlowGraphPass::Initialize() { return Pass::Initialize(); }
+
+bool FluidToDataFlowGraphPass::Initialize(
+    const framework::proto::ProgramDesc &desc) {
+  desc_ = &desc;
+  return true;
+}
+
+bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); }
+
+void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
+  // insert vars
+  std::unordered_map<std::string, size_t> var2id;
+  auto &main_block = desc_->blocks(framework::kRootBlockIndex);
+  for (int i = 0; i < main_block.vars_size(); i++) {
+    const auto &var = main_block.vars(i);
+    auto *v = graph->nodes.Create(Node::Type::kValue);
+    v->SetName(var.name());
+    v->SetExtraInfo(const_cast<void *>(static_cast<const void *>(&var)));
+    var2id[var.name()] = v->id();
+  }
+  for (int i = 0; i < main_block.ops_size(); i++) {
+    const auto &op = main_block.ops(i);
+    auto *o = graph->nodes.Create(Node::Type::kFunction);
+    o->SetName(op.type());
+    static_cast<Function *>(o)->SetFuncType(op.type());
+    // Link to the original protobuf message's memory, make it easier to
+    // generate from a data flow graph to fluid ProgramDesc.
+    o->SetExtraInfo(const_cast<void *>(static_cast<const void *>(&op)));
+    // set inputs and outputs
+    // TODO(Superjomn) make sure the InputNames is the real variable name.
+    for (int j = 0; j < op.inputs_size(); j++) {
+      auto &in_var = op.inputs(j);
+      for (int k = 0; k < in_var.arguments_size(); k++) {
+        auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
+        in->outlinks.push_back(o);
+        o->inlinks.push_back(in);
+      }
+    }
+    for (int j = 0; j < op.outputs_size(); j++) {
+      auto &out_var = op.outputs(j);
+      for (int k = 0; k < out_var.arguments_size(); k++) {
+        auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
+        out->inlinks.push_back(o);
+        o->outlinks.push_back(out);
+      }
+    }
+  }
+  // Analysis and extract the inputs and outputs of this graph.
+  graph->Build();
+}
+
+Pass *FluidToDataFlowGraphPass::CreatePrinterPass(
+    std::ostream &os, const std::string &banner) const {
+  return nullptr;
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+/*
+ * This file implements the transformation from data flow graph to fluid
+ * ProgramDesc.
+ */
+
+#pragma once
+
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Transform a FluidDesc to a data flow graph.
+ */
+class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
+ public:
+  FluidToDataFlowGraphPass();
+  bool Initialize() override;
+  bool Initialize(const framework::proto::ProgramDesc &desc) override;
+  bool Finalize() override;
+
+  void Run(DataFlowGraph *graph) override;
+
+  Pass *CreatePrinterPass(std::ostream &os,
+                          const std::string &banner) const override;
+
+ private:
+  framework::proto::ProgramDesc const *desc_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, Init) {
+  FluidToDataFlowGraphPass pass;
+  pass.Initialize();
+  pass.Initialize(desc);
+  DataFlowGraph graph;
+  pass.Run(&graph);
+  ASSERT_GT(graph.nodes.size(), 0);
+  pass.Finalize();
+  LOG(INFO) << '\n' << graph.DotString();
+}
+
+}  // analysis
+}  // inference
+}  // paddle
--- a/paddle/fluid/inference/analysis/graph_traits.cc
+++ b/paddle/fluid/inference/analysis/graph_traits.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/graph_traits.h"
--- a/paddle/fluid/inference/analysis/graph_traits.h
+++ b/paddle/fluid/inference/analysis/graph_traits.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the GraphTraits<X> template class that should be specified
+ * by classes that want to be iteratable by generic graph iterators.
+ *
+ * This file also defines the marker class Inverse that is used to iterate over
+ * graphs in a graph defined, inverse ordering...
+ */
+
+#pragma once
+
+#include "paddle/fluid/inference/analysis/helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * This class should be specialized by different graph types...
+ * That's why the base class is empty.
+ */
+template <typename GraphType>
+struct GraphTraits {
+  // using NodesBFSIterator = xxx
+
+  // NodesBFSIterator nodes_begin();
+  // NodesBFSIterator nodes_end();
+};
+
+/*
+ * Inverse - This class is used as a marker class to tell the graph iterator to
+ * iterate in a graph defined Inverse order.
+ */
+template <typename GraphType>
+struct Inverse {
+  const GraphType &graph;
+
+  explicit Inverse(const GraphType &graph) : graph(graph) {}
+};
+
+/*
+ * Provide a partial specialization of GraphTraits so that the inverse of an
+ * inverse turns into the original graph.
+ */
+template <typename GraphType>
+struct GraphTraits<Inverse<Inverse<GraphType>>> : GraphTraits<GraphType> {};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-template <typename IteratorT>
-class iterator_range {
-  IteratorT begin_, end_;
-
- public:
-  template <typename Container>
-  explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
-
-  iterator_range(const IteratorT &begin, const IteratorT &end)
-      : begin_(begin), end_(end) {}
-
-  const IteratorT &begin() const { return begin_; }
-  const IteratorT &end() const { return end_; }
-};
-
-/*
- * An registry helper class, with its records keeps the order they registers.
- */
-template <typename T>
-class OrderedRegistry {
- public:
-  T *Register(const std::string &name, T *x) {
-    PADDLE_ENFORCE(!dic_.count(name));
-    dic_[name] = data_.size();
-    data_.emplace_back(std::unique_ptr<T>(x));
-    return data_.back().get();
-  }
-
-  T *Lookup(const std::string &name) {
-    auto it = dic_.find(name);
-    if (it == dic_.end()) return nullptr;
-    return data_[it->second].get();
-  }
-
- protected:
-  std::unordered_map<std::string, int> dic_;
-  std::vector<std::unique_ptr<T>> data_;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
-
-#define PADDLE_DISALLOW_COPY_AND_ASSIGN(type__) \
-                                                \
-  type__(const type__ &) = delete;              \
-                                                \
-  void operator=(const type__ &) = delete;
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+#define SET_TYPE(type__) dic_[typeid(type__).hash_code()] = #type__;
+/*
+ * Map typeid to representation.
+ */
+struct DataTypeNamer {
+  static const DataTypeNamer &Global() {
+    static auto *x = new DataTypeNamer();
+    return *x;
+  }
+
+  template <typename T>
+  const std::string &repr() const {
+    auto x = typeid(T).hash_code();
+    PADDLE_ENFORCE(dic_.count(x), "unknown type for representation");
+    return dic_.at(x);
+  }
+
+  const std::string &repr(size_t &hash) const {
+    PADDLE_ENFORCE(dic_.count(hash), "unknown type for representation");
+    return dic_.at(hash);
+  }
+
+ private:
+  DataTypeNamer() {
+    SET_TYPE(int);
+    SET_TYPE(bool);
+    SET_TYPE(float);
+  }
+
+  std::unordered_map<decltype(typeid(int).hash_code()), std::string> dic_;
+};
+#undef SET_TYPE
+
+template <typename IteratorT>
+class iterator_range {
+  IteratorT begin_, end_;
+
+ public:
+  template <typename Container>
+  explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
+
+  iterator_range(const IteratorT &begin, const IteratorT &end)
+      : begin_(begin), end_(end) {}
+
+  const IteratorT &begin() const { return begin_; }
+  const IteratorT &end() const { return end_; }
+};
+
+/*
+ * An registry helper class, with its records keeps the order they registers.
+ */
+template <typename T>
+class OrderedRegistry {
+ public:
+  T *Register(const std::string &name, T *x) {
+    PADDLE_ENFORCE(!dic_.count(name));
+    dic_[name] = data_.size();
+    data_.emplace_back(std::unique_ptr<T>(x));
+    return data_.back().get();
+  }
+
+  T *Lookup(const std::string &name) {
+    auto it = dic_.find(name);
+    if (it == dic_.end()) return nullptr;
+    return data_[it->second].get();
+  }
+
+ protected:
+  std::unordered_map<std::string, int> dic_;
+  std::vector<std::unique_ptr<T>> data_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
+
+#define PADDLE_DISALLOW_COPY_AND_ASSIGN(type__) \
+                                                \
+  type__(const type__ &) = delete;              \
+                                                \
+  void operator=(const type__ &) = delete;
--- a/paddle/fluid/inference/analysis/node.h
+++ b/paddle/fluid/inference/analysis/node.h
@@ -117,7 +117,10 @@ class Node {
        type_hash_ = typeid(T).hash_code();
        data_.resize(sizeof(T));
      }
-      PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(), "type not matched");
+      PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(),
+                     "type not matched, origin is %s, want %s",
+                     DataTypeNamer::Global().repr(type_hash_),
+                     DataTypeNamer::Global().repr<T>());
      PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
      return *reinterpret_cast<T *>(&data_[0]);
    }
@@ -127,6 +130,10 @@ class Node {
    size_t type_hash_{std::numeric_limits<size_t>::max()};
  };

+  bool IsFunction() const { return type_ == Node::Type::kFunction; }
+  bool IsValue() const { return type_ == Node::Type::kValue; }
+  bool IsFunctionBlock() const { return type_ == Node::Type::kFunctionBlock; }
+
  virtual ~Node() {}

  friend class NodeMap;

--- a/paddle/fluid/inference/analysis/node_tester.cc
+++ b/paddle/fluid/inference/analysis/node_tester.cc
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at

-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0

-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

 #include "paddle/fluid/inference/analysis/node.h"


--- a/paddle/fluid/inference/analysis/pass.cc
+++ b/paddle/fluid/inference/analysis/pass.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/pass.h"
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <iosfwd>
+
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/node.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class Pass {
+ public:
+  Pass() = default;
+  virtual ~Pass() {}
+  // Virtual method overridden by subclasses to do only necessary initialization
+  // before any pass is run.
+  virtual bool Initialize() { return false; }
+  // There is some passes such as FlowToDataFlowGraphPass that needs a
+  // ProgramDesc. Here use the native ProgramDesc ProtoBuf message, so that it
+  // only couple with the proto file.
+  virtual bool Initialize(const framework::proto::ProgramDesc &desc) {
+    return false;
+  }
+  // There are some Passes such as DataFlowGraphToFluidPass that will output a
+  // ProgramDesc.
+  virtual bool Initialize(framework::proto::ProgramDesc *desc) { return false; }
+
+  // Virtual method overriden by subclasses to do any necessary clean up after
+  // all passes have run.
+  virtual bool Finalize() { return false; }
+
+  // Get a Pass appropriate to print the Node this pass operates on.
+  virtual Pass *CreatePrinterPass(std::ostream &os,
+                                  const std::string &banner) const = 0;
+
+  // Run on a single Node.
+  virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
+  // Run on a single Function.
+  virtual void Run(Function *x) { LOG(FATAL) << "not valid"; }
+  // Run on a single FunctionBlock.
+  virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
+  // Run on a single DataFlowGraph.
+  virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; }
+};
+
+// NodePass process on any Node types.
+class NodePass : public Pass {
+ public:
+  virtual void Run(Node *node) = 0;
+};
+
+// NodePass process on any Function node types.
+class FunctionPass : public Pass {
+ public:
+  virtual void Run(Function *node) = 0;
+};
+
+// NodePass process on any FunctionBlock node types.
+class FunctionBlockPass : public Pass {
+ public:
+  virtual void Run(FunctionBlock *node) = 0;
+};
+
+// GraphPass processes on any GraphType.
+class DataFlowGraphPass : public Pass {
+ public:
+  virtual void Run(DataFlowGraph *graph) = 0;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+const char *SubGraphSplitter::kMarkerAttrName =
+    "_sub_graph_splitter_inside_sub_graph";
+
+std::vector<std::vector<Node *>> SubGraphSplitter::operator()() {
+  MarkNodesInsideSubGraph();
+  return ExtractSubGraphs();
+}
+
+// Mark the output variables inside a subgraph with the func.
+inline void MarkOutLinksInSubGraph(const Function *func) {
+  for (auto *var : func->outlinks) {
+    var->attr(SubGraphSplitter::kMarkerAttrName).Bool() = true;
+  }
+}
+
+void SubGraphSplitter::MarkNodesInsideSubGraph() {
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+    if (node_inside_subgraph_teller_(&node)) {
+      node.attr(kMarkerAttrName).Bool() = true;
+      if (node.type() == Node::Type::kFunction) {
+        // If a function is inside the sub-graph, mark all the output variables
+        // to be inside too, so that two marked functions will be inside a same
+        // sub-graph, lets take a example:  A_function->var->B_function, if
+        // A_function is marked, var should also be marked, so that B_function
+        // will be in the same sub-graph with A_function if B_function is
+        // marked.
+        MarkOutLinksInSubGraph(static_cast<const Function *>(&node));
+      }
+    }
+  }
+}
+
+const char *kUnionFindParent = "_sub_graph_splitter_union_find_parent_";
+
+// Use the Union Find(UF) algorithm to find fully connected sub-graphs, if node
+// a's output is node b, that is a and b is in the same sub-graph. The UF
+// algorithm will group them to the same cluster.
+using node_map_t = std::unordered_map<int, Node *>;
+// Find the ancestor id of a node.
+int UnionFindGetAncestor(const node_map_t &node_map, size_t id) {
+  int tmp = id;
+  do {
+    tmp = node_map.at(tmp)->attr(kUnionFindParent).Int32();
+  } while (node_map.at(tmp)->attr(kUnionFindParent).Int32() != tmp);
+  return tmp;
+}
+// Make this two node share the same ancestor.
+// TODO(Superjom) bad performance, make a balanced tree latter.
+void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
+  int a_ancestor = UnionFindGetAncestor(node_map, a);
+  int b_ancestor = UnionFindGetAncestor(node_map, b);
+  node_map.at(b_ancestor)->attr(kUnionFindParent).Int32() = a_ancestor;
+  node_map.at(a)->attr(kUnionFindParent).Int32() = a_ancestor;
+  node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor;
+}
+
+std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
+  std::vector<Node *> marked_nodes;
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+    if (node.attr(kMarkerAttrName).Bool()) {
+      marked_nodes.push_back(&node);
+    }
+  }
+  // extract sub-graphs in the marked node set, use Union Find algorithm.
+  node_map_t node_map;  // id to ptr
+  for (auto *n : marked_nodes) {
+    // n's parent == n.id means it is the ancestor
+    n->attr(kUnionFindParent).Int32() = n->id();
+    node_map[n->id()] = n;
+  }
+  std::unordered_set<Node *> visited;
+  for (auto *n : marked_nodes) {
+    for (auto *out : n->outlinks) {
+      if (node_map.count(out->id())) {
+        UnionFindCombine(node_map, n->id(), out->id());
+      }
+    }
+  }
+
+  std::unordered_map<int /*ancestor*/, std::vector<Node *>> clusters;
+  for (auto *n : marked_nodes) {
+    if (n->type() == Node::Type::kFunction) {
+      clusters[UnionFindGetAncestor(node_map,
+                                    n->attr(kUnionFindParent).Int32())]
+          .push_back(n);
+    }
+  }
+  std::vector<std::vector<Node *>> result;
+  std::for_each(clusters.begin(), clusters.end(),
+                [&](const decltype(clusters)::value_type &it) {
+                  result.push_back(it.second);
+                });
+
+  return result;
+}
+
+void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
+
+void SubGraphFuse::ReplaceNodesWithSubGraphs() {
+  auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
+  for (auto &subgraph : subgraphs) {
+    // replace this sub-graph with the first node. Two steps: 1. Create a Block
+    // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
+    // as deleted. 3. Replace the deleted node with the new Block Node.
+    auto *block_node = graph_->nodes.Create(Node::Type::kFunctionBlock);
+    auto io = ExtractInputAndOutputOfSubGraph(subgraph);
+    block_node->inlinks = std::move(io.first);
+    block_node->outlinks = std::move(io.second);
+    for (auto *node : subgraph) {
+      // TODO(Superjomn) need a unified mechanism to treat deleted node in each
+      // pass.
+      node->SetDeleted();
+    }
+
+    std::unordered_map<Node *, Node *>
+        delelte_node_map;  // deleted node to BlockNode
+    for (auto *n : block_node->inlinks) {
+      n->inlinks.clear();
+    }
+    for (auto *n : block_node->outlinks) {
+      n->outlinks.clear();
+    }
+    for (auto *n : block_node->inlinks) {
+      n->outlinks.push_back(block_node);
+    }
+    for (auto *n : block_node->outlinks) {
+      n->inlinks.push_back(n);
+    }
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/subgraph_splitter.h
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the the class to partition a graph.
+ */
+
+#pragma once
+
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/node.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Detect the nodes in a sub-graph that meet some conditions. This class doesn't
+ * modify the graph.
+ */
+class SubGraphSplitter {
+ public:
+  static const char *kMarkerAttrName;
+  // Tell whether a node is inside a sub-graph.
+  using NodeInsideSubgraphTeller = std::function<bool(const Node *)>;
+
+  SubGraphSplitter(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
+      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+
+  std::vector<std::vector<Node *>> operator()();
+
+ protected:
+  // Mark the nodes inside the accepted sub-graph using
+  // node_inside_subgraph_teller.
+  void MarkNodesInsideSubGraph();
+
+  // Merge the marked nodes into sub-graphs and return the sub-graphs.
+  std::vector<std::vector<Node *>> ExtractSubGraphs();
+
+ private:
+  DataFlowGraph *graph_;
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+};
+
+/*
+ * SubGraphFuse - Replace some nodes with the sub-graph node they are inside. To
+ * some extent, the TensorRT engine is just a fusion op for a model.
+ */
+class SubGraphFuse {
+ public:
+  using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;
+
+  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
+      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+
+  // The main method which run all the logic.
+  void operator()();
+
+ protected:
+  // Remove the nodes inside sub-graphs and replace with the SubGraphNode.
+  void ReplaceNodesWithSubGraphs();
+
+ private:
+  DataFlowGraph *graph_;
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, Split) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+  LOG(INFO) << "spliter\n" << dfg.DotString();
+
+  SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
+    if (node->type() != Node::Type::kFunction) return false;
+    const auto* func = static_cast<const Function*>(node);
+    if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
+        func->func_type() == "conv2d" || func->func_type() == "mul" ||
+        func->func_type() == "sigmoid" || func->func_type() == "softmax") {
+      LOG(INFO) << "sub-graph marked " << node->repr();
+      return true;
+    }
+    return false;
+  };
+  ASSERT_GT(dfg.nodes.size(), 5UL);
+
+  auto subgraphs = SubGraphSplitter(&dfg, teller)();
+
+  // Check the number of the marked nodes.
+  int marked_nodes = 0;
+  for (auto& node : dfg.nodes.nodes()) {
+    if (node->IsFunction() &&
+        node->attr(SubGraphSplitter::kMarkerAttrName).Bool()) {
+      ++marked_nodes;
+    }
+  }
+  EXPECT_EQ(marked_nodes, 6);
+
+  // For human debug.
+  for (auto& subgraph : subgraphs) {
+    LOG(INFO) << "subgraph size " << subgraph.size();
+    for (auto* node : subgraph) {
+      LOG(INFO) << "node " << node->repr();
+    }
+  }
+
+  ASSERT_EQ(subgraphs.size(), 1UL);
+  // The last sub-graph has 5 Functions.
+  ASSERT_EQ(subgraphs.back().size(), 6UL);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/ut_helper.h
+++ b/paddle/fluid/inference/analysis/ut_helper.h
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
 nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
 nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
-  DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine)
+    DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
+    SERIAL)
 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -201,11 +201,13 @@ if(WITH_DISTRIBUTE)
    set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
    set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor)
+    #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
+    #        listen_and_serv_op sum_op executor SERIAL)
    if(WITH_GPU)
        set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-        cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op listen_and_serv_op executor)
+        cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op
+                listen_and_serv_op executor SERIAL)
        op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
        set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    else()

--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
--- a/paddle/fluid/operators/detail/grpc_server_test.cc
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/detail/grpc_service.h
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
--- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc
+++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
--- a/paddle/fluid/operators/reduce_op.cc
+++ b/paddle/fluid/operators/reduce_op.cc
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
--- a/paddle/fluid/operators/test_send_nccl_id.cc
+++ b/paddle/fluid/operators/test_send_nccl_id.cc
--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
--- a/paddle/fluid/train/demo/README.md
+++ b/paddle/fluid/train/demo/README.md
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
--- a/paddle/parameter/CMakeLists.txt
+++ b/paddle/parameter/CMakeLists.txt
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
--- a/paddle/scripts/travis/build_ios.sh
+++ b/paddle/scripts/travis/build_ios.sh
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
--- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
--- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt
--- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/no_test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/no_test_label_semantic_roles.py
--- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt
--- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
--- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt
--- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
--- a/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt
--- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
--- a/python/paddle/fluid/tests/test_cpp_reader.py
+++ b/python/paddle/fluid/tests/test_cpp_reader.py
--- a/python/paddle/fluid/tests/test_lod_tensor.py
+++ b/python/paddle/fluid/tests/test_lod_tensor.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py