diff --git a/CMakeLists.txt b/CMakeLists.txt
index b1d0abdf2ceb4cf338dde782a97a6df906149655..c2fa5420e916fd5958f6198d6e97c9b1092b5aa1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -213,9 +213,11 @@ include(configure)          # add paddle env configuration
 if(WITH_GPU)
     include(cuda)
     include(tensorrt)
+endif()
+if(WITH_MKL OR WITH_MKLML)
     include(external/anakin)
 elseif()
-    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
+    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()
 
 include(generic)            # simplify cmake module
diff --git a/Dockerfile b/Dockerfile
index 402adee2ea2822250ebc8f6229fd6a44545d58e5..634be18a51bf61e96a8bf6f263b6674a7932d6e4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh
 #    and its size is only one-third of the official one.
 # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
 #    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
     tar -xz -C /usr/local && \
     cp -rf /usr/local/TensorRT/include /usr && \
     cp -rf /usr/local/TensorRT/lib /usr
diff --git a/README.md b/README.md
index a67cb8ad439f462c361cb6bac2449c3a4b042126..60ffbe728178705b1734e682868614025214c2a4 100644
--- a/README.md
+++ b/README.md
@@ -76,33 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85
 
 ## Installation
 
-It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html)
-before looking into the
-[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html).
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website.
 
 ## Documentation
 
-We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
-[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation.
 
-- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)
+- [Deep Learning 101](https://github.com/PaddlePaddle/book)
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html)
-
-   You can also run distributed training jobs on Kubernetes clusters.
-
-- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
 
    We appreciate your contributions!
 
diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
index 707fadb1fae97cefe8a41715cd57d71754abda41..2e1e0d376899fd664866621263db62258e7c3869 100644
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
@@ -11,6 +11,7 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
 # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
 # exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
 
+
 RUN pip install -U pip
 RUN pip install -U kubernetes paddlepaddle
 
@@ -27,5 +28,6 @@ ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl 
 
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
+ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
 ADD models/ /workspace/models/
+
diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py
index a79f25ccc6ace1594f3f331633130eaace5e175b..ed696e82f8723eba573e8affd3f25e2aa6426e63 100644
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@@ -17,7 +17,8 @@ import argparse
 __all__ = ['parse_args', ]
 
 BENCHMARK_MODELS = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "se_resnext", "vgg", "mnist",
+    "stacked_dynamic_lstm", "resnet_with_preprocess"
 ]
 
 
@@ -67,12 +68,12 @@ def parse_args():
         '--cpus',
         type=int,
         default=1,
-        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+        help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
     parser.add_argument(
         '--data_set',
         type=str,
         default='flowers',
-        choices=['cifar10', 'flowers'],
+        choices=['cifar10', 'flowers', 'imagenet'],
         help='Optional dataset for benchmark.')
     parser.add_argument(
         '--infer_only', action='store_true', help='If set, run forward only.')
@@ -122,6 +123,11 @@ def parse_args():
         type=str,
         default="",
         help='Directory that contains all the training recordio files.')
+    parser.add_argument(
+        '--test_data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the test data (NOT recordio).')
     parser.add_argument(
         '--use_inference_transpiler',
         action='store_true',
@@ -130,5 +136,9 @@ def parse_args():
         '--no_random',
         action='store_true',
         help='If set, keep the random seed and do not shuffle the data.')
+    parser.add_argument(
+        '--use_lars',
+        action='store_true',
+        help='If set, use lars for optimizers, ONLY support resnet module.')
     args = parser.parse_args()
     return args
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 6b22f8f520e3d9c6c89d41a7455a6f9ebbad6d80..11bd75e1d09a6b51c7c749c512f2b71f3604f3fb 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -16,6 +16,7 @@ import argparse
 import cProfile
 import time
 import os
+import traceback
 
 import numpy as np
 
@@ -27,7 +28,7 @@ import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
 from args import *
 
 
-def append_nccl2_prepare(trainer_id):
+def append_nccl2_prepare(trainer_id, startup_prog):
     if trainer_id >= 0:
         # append gen_nccl_id at the end of startup program
         trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
@@ -40,11 +41,11 @@ def append_nccl2_prepare(trainer_id):
         current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
         worker_endpoints.remove(current_endpoint)
 
-        nccl_id_var = fluid.default_startup_program().global_block().create_var(
+        nccl_id_var = startup_prog.global_block().create_var(
             name="NCCLID",
             persistable=True,
             type=fluid.core.VarDesc.VarType.RAW)
-        fluid.default_startup_program().global_block().append_op(
+        startup_prog.global_block().append_op(
             type="gen_nccl_id",
             inputs={},
             outputs={"NCCLID": nccl_id_var},
@@ -59,7 +60,7 @@ def append_nccl2_prepare(trainer_id):
                         "nccl-based dist train.")
 
 
-def dist_transpile(trainer_id, args):
+def dist_transpile(trainer_id, args, train_prog, startup_prog):
     if trainer_id < 0:
         return None, None
 
@@ -80,132 +81,69 @@ def dist_transpile(trainer_id, args):
     # the role, should be either PSERVER or TRAINER
     training_role = os.getenv("PADDLE_TRAINING_ROLE")
 
-    t = distribute_transpiler.DistributeTranspiler()
+    config = distribute_transpiler.DistributeTranspilerConfig()
+    config.slice_var_up = not args.no_split_var
+    t = distribute_transpiler.DistributeTranspiler(config=config)
     t.transpile(
         trainer_id,
+        # NOTE: *MUST* use train_prog, for we are using with guard to
+        # generate different program for train and test.
+        program=train_prog,
         pservers=pserver_endpoints,
         trainers=trainers,
         sync_mode=not args.async_mode)
     if training_role == "PSERVER":
         pserver_program = t.get_pserver_program(current_endpoint)
-        pserver_startup_program = t.get_startup_program(current_endpoint,
-                                                        pserver_program)
+        pserver_startup_program = t.get_startup_program(
+            current_endpoint, pserver_program, startup_program=startup_prog)
         return pserver_program, pserver_startup_program
     elif training_role == "TRAINER":
         train_program = t.get_trainer_program()
-        return train_program, fluid.default_startup_program()
+        return train_program, startup_prog
     else:
         raise ValueError(
             'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
         )
 
 
-def test(exe, inference_program, test_reader, feeder, batch_acc):
-    accuracy_evaluator = fluid.metrics.Accuracy()
-    for batch_id, data in enumerate(test_reader()):
-        acc = exe.run(inference_program,
-                      feed=feeder.feed(data),
-                      fetch_list=[batch_acc])
-        accuracy_evaluator.update(value=np.array(acc), weight=len(data))
+def test_parallel(exe, test_args, args, test_prog, feeder):
+    acc_evaluators = []
+    for i in xrange(len(test_args[2])):
+        acc_evaluators.append(fluid.metrics.Accuracy())
 
-    return accuracy_evaluator.eval()
-
-
-# TODO(wuyi): replace train, train_parallel, test functions with new trainer
-# API once it is ready.
-def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
-          args, train_prog, startup_prog):
-    if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
-        place = core.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-        exe.run(train_prog)
-        return
-
-    if args.use_fake_data:
-        raise Exception(
-            "fake data is not supported in single GPU test for now.")
-
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(startup_prog)
-
-    # Use inference_transpiler to speedup
-    if not args.use_reader_op:
-        feed_var_list = [
-            var for var in train_prog.global_block().vars.itervalues()
-            if var.is_data
-        ]
-        feeder = fluid.DataFeeder(feed_var_list, place)
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        train_losses = []
-        if not args.use_reader_op:
-            reader_generator = train_reader()
-        batch_id = 0
-        data = None
+    to_fetch = [v.name for v in test_args[2]]
+    if args.use_reader_op:
+        test_args[4].start()
         while True:
-            if not args.use_reader_op:
-                data = next(reader_generator, None)
-                if data == None:
-                    break
-            if iters == args.iterations:
+            try:
+                acc_rets = exe.run(fetch_list=to_fetch)
+                for i, e in enumerate(acc_evaluators):
+                    e.update(
+                        value=np.array(acc_rets[i]), weight=args.batch_size)
+            except fluid.core.EOFException as eof:
+                test_args[4].reset()
                 break
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
+    else:
+        for batch_id, data in enumerate(test_args[3]()):
+            acc_rets = exe.run(feed=feeder.feed(data), fetch_list=to_fetch)
+            for i, e in enumerate(acc_evaluators):
+                e.update(value=np.array(acc_rets[i]), weight=len(data))
 
-            if args.use_reader_op:
-                try:
-                    loss = exe.run(train_prog, fetch_list=[avg_loss])
-                except fluid.core.EnforceNotMet as ex:
-                    break
-            else:
-                loss = exe.run(train_prog,
-                               feed=feeder.feed(data),
-                               fetch_list=[avg_loss])
-            iters += 1
-            batch_id += 1
-            # FIXME(wuyi): For use_reader_op, if the current
-            # pass is not the last, the last batch of this pass
-            # is also equal to args.batch_size.
-            if args.use_reader_op:
-                num_samples += args.batch_size * args.gpus
-            else:
-                num_samples += len(data)
-            train_losses.append(loss)
-            print("Pass: %d, Iter: %d, Loss: %f\n" %
-                  (pass_id, iters, np.mean(train_losses)))
-        print_train_time(start_time, time.time(), num_samples)
-        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
-        # evaluation
-        if not args.no_test and batch_acc and not args.use_reader_op:
-            if args.use_inference_transpiler:
-                t = fluid.InferenceTranspiler()
-                t.transpile(infer_prog, place)
-
-            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
-                                 batch_acc)
-            print(", Test Accuracy: %f" % pass_test_acc)
-        print("\n")
-        # TODO(wuyi): add warmup passes to get better perf data.
-        exit(0)
+    return [e.eval() for e in acc_evaluators]
 
 
-# TODO(wuyi): replace train, train_parallel, test functions with new trainer
-# API once it is ready.
-def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
-                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
-                   num_trainers, trainer_id):
+# NOTE: only need to benchmark using parallelexe
+def train_parallel(train_args, test_args, args, train_prog, test_prog,
+                   startup_prog, nccl_id_var, num_trainers, trainer_id):
+    over_all_start = time.time()
     place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    feeder = None
     if not args.use_reader_op:
         feed_var_list = [
             var for var in train_prog.global_block().vars.itervalues()
             if var.is_data
         ]
         feeder = fluid.DataFeeder(feed_var_list, place)
-
     # generate fake:
     if args.use_fake_data:
         for var in feed_var_list:
@@ -229,62 +167,110 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
     startup_exe = fluid.Executor(place)
     startup_exe.run(startup_prog)
     strategy = fluid.ExecutionStrategy()
-    strategy.num_threads = 1
+    strategy.num_threads = args.cpus
     strategy.allow_op_delay = False
+    avg_loss = train_args[0]
+
+    if args.update_method == "pserver":
+        # parameter server mode distributed training, merge
+        # gradients on local server, do not initialize
+        # ParallelExecutor with multi server all-reduce mode.
+        num_trainers = 1
+        trainer_id = 0
+
     exe = fluid.ParallelExecutor(
         True,
         avg_loss.name,
+        main_program=train_prog,
         exec_strategy=strategy,
         num_trainers=num_trainers,
         trainer_id=trainer_id)
 
+    if not args.no_test:
+        if args.update_method == "pserver":
+            test_scope = None
+        else:
+            # NOTE: use an empty scope to avoid test exe using NCCLID
+            test_scope = fluid.Scope()
+        test_exe = fluid.ParallelExecutor(
+            True, main_program=test_prog, share_vars_from=exe)
+
     for pass_id in range(args.pass_num):
         num_samples = 0
         iters = 0
         start_time = time.time()
         if not args.use_reader_op:
-            reader_generator = train_reader()
+            reader_generator = train_args[3]()  #train_reader
         batch_id = 0
         data = None
+        if args.use_reader_op:
+            train_args[4].start()
         while True:
             if not args.use_reader_op:
                 data = next(reader_generator, None)
                 if data == None:
                     break
+            if args.profile and batch_id == 5:
+                profiler.start_profiler("All")
+                profiler.reset_profiler()
+            elif args.profile and batch_id == 10:
+                print("profiling total time: ", time.time() - start_time)
+                profiler.stop_profiler("total", "/tmp/profile_%d_pass%d" %
+                                       (trainer_id, pass_id))
             if iters == args.iterations:
+                reader_generator.close()
                 break
-            if args.profile and pass_id == 0 and batch_id == 5:
-                profiler.start_profiler("All")
-            elif args.profile and pass_id == 0 and batch_id == 10:
-                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
 
             if iters == args.skip_batch_num:
                 start_time = time.time()
                 num_samples = 0
+            fetch_list = [avg_loss.name]
+            acc_name_list = [v.name for v in train_args[2]]
+            fetch_list.extend(acc_name_list)
+
             if args.use_fake_data or args.use_reader_op:
                 try:
-                    loss, = exe.run([avg_loss.name])
+
+                    fetch_ret = exe.run(fetch_list)
+                except fluid.core.EOFException as eof:
+                    break
                 except fluid.core.EnforceNotMet as ex:
+                    traceback.print_exc()
                     break
             else:
-                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+                fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
             if args.use_reader_op:
                 num_samples += args.batch_size * args.gpus
             else:
                 num_samples += len(data)
+
             iters += 1
             if batch_id % 1 == 0:
-                print("Pass %d, batch %d, loss %s" %
-                      (pass_id, batch_id, np.array(loss)))
+                fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
+                print("Pass %d, batch %d, loss %s, accucacys: %s" %
+                      (pass_id, batch_id, fetched_data[0], fetched_data[1:]))
             batch_id += 1
 
         print_train_time(start_time, time.time(), num_samples)
-        if not args.no_test and batch_acc and not args.use_reader_op:
-            # we have not implement record io for test
-            # skip test when use args.use_reader_op
-            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
-                            batch_acc)
-            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
+        if args.use_reader_op:
+            train_args[4].reset()  # reset reader handle
+        else:
+            del reader_generator
+
+        if not args.no_test and test_args[2]:
+            test_feeder = None
+            if not args.use_reader_op:
+                test_feed_var_list = [
+                    var for var in test_prog.global_block().vars.itervalues()
+                    if var.is_data
+                ]
+                test_feeder = fluid.DataFeeder(test_feed_var_list, place)
+            test_ret = test_parallel(test_exe, test_args, args, test_prog,
+                                     test_feeder)
+            print("Pass: %d, Test Accuracy: %s\n" %
+                  (pass_id, [np.mean(np.array(v)) for v in test_ret]))
+
+    print("total train time: ", time.time() - over_all_start)
 
 
 def print_arguments(args):
@@ -326,44 +312,46 @@ def main():
     if args.use_cprof:
         pr = cProfile.Profile()
         pr.enable()
+
     model_def = __import__("models.%s" % args.model, fromlist=["models"])
-    train_args = list(model_def.get_model(args))
-    train_args.append(args)
-    # Run optimizer.minimize(avg_loss)
-    train_args[2].minimize(train_args[0])
-    if args.memory_optimize:
-        fluid.memory_optimize(fluid.default_main_program())
+
+    train_prog = fluid.Program()
+    test_prog = fluid.Program()
+    startup_prog = fluid.Program()
+
+    train_args = list(model_def.get_model(args, True, train_prog, startup_prog))
+    test_args = list(model_def.get_model(args, False, test_prog, startup_prog))
+
+    all_args = [train_args, test_args, args]
 
     if args.update_method == "pserver":
-        train_prog, startup_prog = dist_transpile(trainer_id, args)
+        train_prog, startup_prog = dist_transpile(trainer_id, args, train_prog,
+                                                  startup_prog)
         if not train_prog:
             raise Exception(
                 "Must configure correct environments to run dist train.")
-        train_args.extend([train_prog, startup_prog])
+        all_args.extend([train_prog, test_prog, startup_prog])
         if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
-            train_args.extend([nccl_id_var, num_trainers, trainer_id])
-            train_parallel(*train_args)
-        train(*train_args)
+            all_args.extend([nccl_id_var, num_trainers, trainer_id])
+            train_parallel(*all_args)
+        elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
+            # start pserver with Executor
+            server_exe = fluid.Executor(fluid.CPUPlace())
+            server_exe.run(startup_prog)
+            server_exe.run(train_prog)
         exit(0)
 
     # for other update methods, use default programs
-    train_args.append(fluid.default_main_program())
-    train_args.append(fluid.default_startup_program())
+    all_args.extend([train_prog, test_prog, startup_prog])
 
     if args.update_method == "nccl2":
-        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
-    if args.gpus == 1:
-        # NOTE: parallel executor use profiler interanlly
-        if args.use_nvprof and args.device == 'GPU':
-            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
-                train(*train_args)
-        else:
-            train(*train_args)
-    else:
-        if args.device == "CPU":
-            raise Exception("Only support GPU perf with parallel exe")
-        train_args.extend([nccl_id_var, num_trainers, trainer_id])
-        train_parallel(*train_args)
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(
+            trainer_id, startup_prog)
+
+    if args.device == "CPU":
+        raise Exception("Only support GPU perf with parallel exe")
+    all_args.extend([nccl_id_var, num_trainers, trainer_id])
+    train_parallel(*all_args)
 
 
 if __name__ == "__main__":
diff --git a/benchmark/fluid/imagenet_reader.py b/benchmark/fluid/imagenet_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..a39485a61f12417fbdb512fc81e90ec49c310bf5
--- /dev/null
+++ b/benchmark/fluid/imagenet_reader.py
@@ -0,0 +1,344 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import math
+import random
+import functools
+import numpy as np
+from threading import Thread
+import subprocess
+import time
+
+from Queue import Queue
+import paddle
+from PIL import Image, ImageEnhance
+
+random.seed(0)
+
+DATA_DIM = 224
+
+THREAD = int(os.getenv("PREPROCESS_THREADS", "10"))
+BUF_SIZE = 5120
+
+DATA_DIR = '/mnt/ImageNet'
+TRAIN_LIST = '/mnt/ImageNet/train.txt'
+TEST_LIST = '/mnt/ImageNet/val.txt'
+
+img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
+img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
+
+
+def resize_short(img, target_size):
+    percent = float(target_size) / min(img.size[0], img.size[1])
+    resized_width = int(round(img.size[0] * percent))
+    resized_height = int(round(img.size[1] * percent))
+    img = img.resize((resized_width, resized_height), Image.LANCZOS)
+    return img
+
+
+def crop_image(img, target_size, center):
+    width, height = img.size
+    size = target_size
+    if center == True:
+        w_start = (width - size) / 2
+        h_start = (height - size) / 2
+    else:
+        w_start = random.randint(0, width - size)
+        h_start = random.randint(0, height - size)
+    w_end = w_start + size
+    h_end = h_start + size
+    img = img.crop((w_start, h_start, w_end, h_end))
+    return img
+
+
+def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
+    aspect_ratio = math.sqrt(random.uniform(*ratio))
+    w = 1. * aspect_ratio
+    h = 1. / aspect_ratio
+
+    bound = min((float(img.size[0]) / img.size[1]) / (w**2),
+                (float(img.size[1]) / img.size[0]) / (h**2))
+    scale_max = min(scale[1], bound)
+    scale_min = min(scale[0], bound)
+
+    target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
+                                                             scale_max)
+    target_size = math.sqrt(target_area)
+    w = int(target_size * w)
+    h = int(target_size * h)
+
+    i = random.randint(0, img.size[0] - w)
+    j = random.randint(0, img.size[1] - h)
+
+    img = img.crop((i, j, i + w, j + h))
+    img = img.resize((size, size), Image.LANCZOS)
+    return img
+
+
+def rotate_image(img):
+    angle = random.randint(-10, 10)
+    img = img.rotate(angle)
+    return img
+
+
+def distort_color(img):
+    def random_brightness(img, lower=0.5, upper=1.5):
+        e = random.uniform(lower, upper)
+        return ImageEnhance.Brightness(img).enhance(e)
+
+    def random_contrast(img, lower=0.5, upper=1.5):
+        e = random.uniform(lower, upper)
+        return ImageEnhance.Contrast(img).enhance(e)
+
+    def random_color(img, lower=0.5, upper=1.5):
+        e = random.uniform(lower, upper)
+        return ImageEnhance.Color(img).enhance(e)
+
+    ops = [random_brightness, random_contrast, random_color]
+    random.shuffle(ops)
+
+    img = ops[0](img)
+    img = ops[1](img)
+    img = ops[2](img)
+
+    return img
+
+
+def process_image(sample, mode, color_jitter, rotate):
+    img_path = sample[0]
+
+    img = Image.open(img_path)
+    if mode == 'train':
+        if rotate: img = rotate_image(img)
+        img = random_crop(img, DATA_DIM)
+    else:
+        img = resize_short(img, target_size=256)
+        img = crop_image(img, target_size=DATA_DIM, center=True)
+    if mode == 'train':
+        if color_jitter:
+            img = distort_color(img)
+        if random.randint(0, 1) == 1:
+            img = img.transpose(Image.FLIP_LEFT_RIGHT)
+
+    if img.mode != 'RGB':
+        img = img.convert('RGB')
+
+    img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
+    img -= img_mean
+    img /= img_std
+
+    if mode == 'train' or mode == 'val':
+        return img, sample[1]
+    elif mode == 'test':
+        return [img]
+
+
+class XmapEndSignal():
+    pass
+
+
+def xmap_readers(mapper,
+                 reader,
+                 process_num,
+                 buffer_size,
+                 order=False,
+                 print_queue_state=True):
+    end = XmapEndSignal()
+
+    # define a worker to read samples from reader to in_queue
+    def read_worker(reader, in_queue):
+        for i in reader():
+            in_queue.put(i)
+        in_queue.put(end)
+
+    # define a worker to read samples from reader to in_queue with order flag
+    def order_read_worker(reader, in_queue, file_queue):
+        in_order = 0
+        for i in reader():
+            in_queue.put((in_order, i))
+            in_order += 1
+        in_queue.put(end)
+
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue
+    def handle_worker(in_queue, out_queue, mapper):
+        sample = in_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            r = mapper(sample)
+            out_queue.put(r)
+            sample = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue by order
+    def order_handle_worker(in_queue, out_queue, mapper, out_order):
+        ins = in_queue.get()
+        while not isinstance(ins, XmapEndSignal):
+            order, sample = ins
+            r = mapper(sample)
+            while order != out_order[0]:
+                pass
+            out_queue.put(r)
+            out_order[0] += 1
+            ins = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    def xreader():
+        file_queue = Queue()
+        in_queue = Queue(buffer_size)
+        out_queue = Queue(buffer_size)
+        out_order = [0]
+        # start a read worker in a thread
+        target = order_read_worker if order else read_worker
+        t = Thread(target=target, args=(reader, in_queue))
+        t.daemon = True
+        t.start()
+        # start several handle_workers
+        target = order_handle_worker if order else handle_worker
+        args = (in_queue, out_queue, mapper, out_order) if order else (
+            in_queue, out_queue, mapper)
+        workers = []
+        for i in xrange(process_num):
+            worker = Thread(target=target, args=args)
+            worker.daemon = True
+            workers.append(worker)
+        for w in workers:
+            w.start()
+
+        sample = out_queue.get()
+        start_t = time.time()
+        while not isinstance(sample, XmapEndSignal):
+            yield sample
+            sample = out_queue.get()
+            if time.time() - start_t > 3:
+                if print_queue_state:
+                    print("queue sizes: ", in_queue.qsize(), out_queue.qsize())
+                start_t = time.time()
+        finish = 1
+        while finish < process_num:
+            sample = out_queue.get()
+            if isinstance(sample, XmapEndSignal):
+                finish += 1
+            else:
+                yield sample
+
+    return xreader
+
+
+def _reader_creator(file_list,
+                    mode,
+                    shuffle=False,
+                    color_jitter=False,
+                    rotate=False,
+                    xmap=True):
+    def reader():
+        with open(file_list) as flist:
+            full_lines = [line.strip() for line in flist]
+            if shuffle:
+                random.shuffle(full_lines)
+            if mode == 'train':
+                trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+                trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+                per_node_lines = len(full_lines) / trainer_count
+                lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1)
+                                   * per_node_lines]
+                print(
+                    "read images from %d, length: %d, lines length: %d, total: %d"
+                    % (trainer_id * per_node_lines, per_node_lines, len(lines),
+                       len(full_lines)))
+            else:
+                lines = full_lines
+
+            for line in lines:
+                if mode == 'train':
+                    img_path, label = line.split()
+                    img_path = img_path.replace("JPEG", "jpeg")
+                    img_path = os.path.join(DATA_DIR, "train", img_path)
+                    yield (img_path, int(label))
+                elif mode == 'val':
+                    img_path, label = line.split()
+                    img_path = img_path.replace("JPEG", "jpeg")
+                    img_path = os.path.join(DATA_DIR, "val", img_path)
+                    yield (img_path, int(label))
+                elif mode == 'test':
+                    img_path = os.path.join(DATA_DIR, line)
+                    yield [img_path]
+
+    mapper = functools.partial(
+        process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
+
+    return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
+
+
+def load_raw_image_uint8(sample):
+    img_arr = np.array(Image.open(sample[0])).astype('int64')
+    return img_arr, int(sample[1])
+
+
+def train_raw(file_list=TRAIN_LIST, shuffle=True):
+    def reader():
+        with open(file_list) as flist:
+            full_lines = [line.strip() for line in flist]
+            if shuffle:
+                random.shuffle(full_lines)
+
+            trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+            trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+            per_node_lines = len(full_lines) / trainer_count
+            lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) *
+                               per_node_lines]
+            print("read images from %d, length: %d, lines length: %d, total: %d"
+                  % (trainer_id * per_node_lines, per_node_lines, len(lines),
+                     len(full_lines)))
+
+            for line in lines:
+                img_path, label = line.split()
+                img_path = img_path.replace("JPEG", "jpeg")
+                img_path = os.path.join(DATA_DIR, "train", img_path)
+                yield (img_path, int(label))
+
+    return paddle.reader.xmap_readers(load_raw_image_uint8, reader, THREAD,
+                                      BUF_SIZE)
+
+
+def train(file_list=TRAIN_LIST, xmap=True):
+    return _reader_creator(
+        file_list,
+        'train',
+        shuffle=True,
+        color_jitter=False,
+        rotate=False,
+        xmap=xmap)
+
+
+def val(file_list=TEST_LIST, xmap=True):
+    return _reader_creator(file_list, 'val', shuffle=False, xmap=xmap)
+
+
+def test(file_list=TEST_LIST):
+    return _reader_creator(file_list, 'test', shuffle=False)
+
+
+if __name__ == "__main__":
+    c = 0
+    start_t = time.time()
+    for d in train()():
+        c += 1
+        if c >= 10000:
+            break
+    spent = time.time() - start_t
+    print("read 10000 speed: ", 10000 / spent, spent)
diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py
index dfe8b5cdd58456902fa8ec355e9837dface3f7be..c1f22f1bfa02dd409edc8e1c39a72524240f4088 100644
--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
@@ -163,6 +163,19 @@ def gen_job():
         volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
         volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})
 
+    # add ceph volumes
+    volumes.append({
+        "name": "ceph-data",
+        "cephfs": {
+            "monitors": ["192.168.16.23:6789"],
+            "secretRef": {
+                "name": "ceph-secret"
+            },
+            "user": "admin",
+        }
+    })
+    volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
+
     tn["spec"]["template"]["spec"]["volumes"] = volumes
     tn_container["volumeMounts"] = volumeMounts
 
diff --git a/benchmark/fluid/models/__init__.py b/benchmark/fluid/models/__init__.py
index 1c3fcac8dd4a1ba0496ef013bd4eb468a0075125..1b8f63c7070c2cd45531966b0bcdff95a848574d 100644
--- a/benchmark/fluid/models/__init__.py
+++ b/benchmark/fluid/models/__init__.py
@@ -13,5 +13,6 @@
 # limitations under the License.
 
 __all__ = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
+    "resnet_with_preprocess"
 ]
diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py
index 17f6b03826ae818a3671ea7f9355a8e8c04b50be..18163c35d65a28c046cfeb33f5b96c34a1a6a35a 100644
--- a/benchmark/fluid/models/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """seq2seq model for fluid."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -181,7 +182,7 @@ def lodtensor_to_ndarray(lod_tensor):
     return ndarray
 
 
-def get_model(args):
+def get_model(args, is_train, main_prog, startup_prog):
     if args.use_reader_op:
         raise Exception("machine_translation do not support reader op for now.")
     embedding_dim = 512
@@ -190,30 +191,27 @@ def get_model(args):
     dict_size = 30000
     beam_size = 3
     max_length = 250
-    avg_cost, feeding_list = seq_to_seq_net(
-        embedding_dim,
-        encoder_size,
-        decoder_size,
-        dict_size,
-        dict_size,
-        False,
-        beam_size=beam_size,
-        max_length=max_length)
-
-    # clone from default main program
-    inference_program = fluid.default_main_program().clone()
-
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-
-    train_batch_generator = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=args.batch_size * args.gpus)
 
-    test_batch_generator = paddle.batch(
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            avg_cost, feeding_list = seq_to_seq_net(
+                embedding_dim,
+                encoder_size,
+                decoder_size,
+                dict_size,
+                dict_size,
+                False,
+                beam_size=beam_size,
+                max_length=max_length)
+    if is_train:
+        optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+        optimizer.minimize(avg_cost)
+
+    batch_generator = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
-        batch_size=args.batch_size)
+            paddle.dataset.wmt14.train(dict_size)
+            if is_train else paddle.dataset.wmt14.test(dict_size),
+            buf_size=1000),
+        batch_size=args.batch_size * args.gpus)
 
-    return avg_cost, inference_program, optimizer, train_batch_generator, \
-           test_batch_generator, None
+    return avg_cost, optimizer, [], batch_generator, None
diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py
index 8e740dc6896b7eeeb82170aa13d32987c4df5c48..cef8657ee629dcbc19221fd3440844a56627e920 100644
--- a/benchmark/fluid/models/mnist.py
+++ b/benchmark/fluid/models/mnist.py
@@ -65,61 +65,50 @@ def cnn_model(data):
     return predict
 
 
-def get_model(args):
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1, 1, 28, 28], (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        images, label = fluid.layers.read_file(data_file)
-    else:
-        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = cnn_model(pd.read_input(images))
-            label = pd.read_input(label)
+def get_model(args, is_train, main_prog, startup_prog):
+    # NOTE: mnist is small, we don't implement data sharding yet.
+    filelist = [
+        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+    ]
+    with fluid.program_guard(main_prog, startup_prog):
+        if args.use_reader_op:
+            data_file_handle = fluid.layers.open_files(
+                filenames=filelist,
+                shapes=[[-1, 1, 28, 28], (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"],
+                thread_num=1,
+                pass_num=1)
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(
+                    data_file_handle, batch_size=args.batch_size))
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                input, label = fluid.layers.read_file(data_file)
+            else:
+                images = fluid.layers.data(
+                    name='pixel', shape=[1, 28, 28], dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            predict = cnn_model(images)
             cost = fluid.layers.cross_entropy(input=predict, label=label)
             avg_cost = fluid.layers.mean(x=cost)
+            # Evaluator
             batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)
-
-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
-    else:
-        # Train program
-        predict = cnn_model(images)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        # Evaluator
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-
-    # Optimization
-    opt = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.001, beta1=0.9, beta2=0.999)
+            # Optimization
+            if is_train:
+                opt = fluid.optimizer.AdamOptimizer(
+                    learning_rate=0.001, beta1=0.9, beta2=0.999)
+                opt.minimize()
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
 
     # Reader
-    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=args.batch_size)
-    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
+    if is_train:
+        reader = paddle.dataset.mnist.train()
+    else:
+        reader = paddle.dataset.mnist.test()
+    batched_reader = paddle.batch(
+        reader, batch_size=args.batch_size * args.gpus)
+    return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
index d44a9c07d31cfae9d54ad5949b85c77e60eae258..ae1baa48e17e40448e457052fd1464b9604a2128 100644
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -27,10 +27,17 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
-from recordio_converter import imagenet_train, imagenet_test
+# from recordio_converter import imagenet_train, imagenet_test
+from imagenet_reader import train, val
 
 
-def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  act='relu',
+                  is_train=True):
     conv1 = fluid.layers.conv2d(
         input=input,
         filter_size=filter_size,
@@ -39,29 +46,31 @@ def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
         padding=padding,
         act=None,
         bias_attr=False)
-    return fluid.layers.batch_norm(input=conv1, act=act)
+    return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)
 
 
-def shortcut(input, ch_out, stride):
+def shortcut(input, ch_out, stride, is_train=True):
     ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
     if ch_in != ch_out:
-        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        return conv_bn_layer(
+            input, ch_out, 1, stride, 0, None, is_train=is_train)
     else:
         return input
 
 
-def basicblock(input, ch_out, stride):
-    short = shortcut(input, ch_out, stride)
-    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+def basicblock(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
     return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
 
 
-def bottleneck(input, ch_out, stride):
-    short = shortcut(input, ch_out * 4, stride)
-    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
-    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+def bottleneck(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out * 4, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
+    conv3 = conv_bn_layer(
+        conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
     return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
 
 
@@ -72,7 +81,11 @@ def layer_warp(block_func, input, ch_out, count, stride):
     return res_out
 
 
-def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+def resnet_imagenet(input,
+                    class_dim,
+                    depth=50,
+                    data_format='NCHW',
+                    is_train=True):
 
     cfg = {
         18: ([2, 2, 2, 1], basicblock),
@@ -115,8 +128,9 @@ def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
     return out
 
 
-def get_model(args):
+def _model_reader_dshape_classdim(args, is_train):
     model = resnet_cifar10
+    reader = None
     if args.data_set == "cifar10":
         class_dim = 10
         if args.data_format == 'NCHW':
@@ -124,8 +138,10 @@ def get_model(args):
         else:
             dshape = [32, 32, 3]
         model = resnet_cifar10
-        train_reader = paddle.dataset.cifar.train10()
-        test_reader = paddle.dataset.cifar.test10()
+        if is_train:
+            reader = paddle.dataset.cifar.train10()
+        else:
+            reader = paddle.dataset.cifar.test10()
     elif args.data_set == "flowers":
         class_dim = 102
         if args.data_format == 'NCHW':
@@ -133,8 +149,10 @@ def get_model(args):
         else:
             dshape = [224, 224, 3]
         model = resnet_imagenet
-        train_reader = paddle.dataset.flowers.train()
-        test_reader = paddle.dataset.flowers.test()
+        if is_train:
+            reader = paddle.dataset.flowers.train()
+        else:
+            reader = paddle.dataset.flowers.test()
     elif args.data_set == "imagenet":
         class_dim = 1000
         if args.data_format == 'NCHW':
@@ -145,64 +163,89 @@ def get_model(args):
         if not args.data_path:
             raise Exception(
                 "Must specify --data_path when training with imagenet")
-        train_reader = imagenet_train(args.data_path)
-        test_reader = imagenet_test(args.data_path)
-
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1] + dshape, (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        input, label = fluid.layers.read_file(data_file)
-    else:
-        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = model(pd.read_input(input), class_dim)
-            label = pd.read_input(label)
+        if not args.use_reader_op:
+            if is_train:
+                reader = train()
+            else:
+                reader = val()
+        else:
+            if is_train:
+                reader = train(xmap=False)
+            else:
+                reader = val(xmap=False)
+    return model, reader, dshape, class_dim
+
+
+def get_model(args, is_train, main_prog, startup_prog):
+    model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
+                                                                     is_train)
+
+    pyreader = None
+    trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=args.batch_size * args.gpus,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('float32', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            predict = model(input, class_dim, is_train=is_train)
             cost = fluid.layers.cross_entropy(input=predict, label=label)
             avg_cost = fluid.layers.mean(x=cost)
-            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)
 
-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
+            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
+            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
+
+            # configure optimize
+            optimizer = None
+            if is_train:
+                if args.use_lars:
+                    lars_decay = 1.0
+                else:
+                    lars_decay = 0.0
+
+                total_images = 1281167 / trainer_count
+
+                step = int(total_images / args.batch_size + 1)
+                epochs = [30, 60, 80, 90]
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = []
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=base_lr,
+                    #learning_rate=fluid.layers.piecewise_decay(
+                    #    boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+                optimizer.minimize(avg_cost)
+
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader if args.no_random else paddle.reader.shuffle(
+                reader, buf_size=5120),
+            batch_size=args.batch_size * args.gpus,
+            drop_last=True)
     else:
-        predict = model(input, class_dim)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc])
-
-    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-
-    batched_train_reader = paddle.batch(
-        train_reader if args.no_random else paddle.reader.shuffle(
-            train_reader, buf_size=5120),
-        batch_size=args.batch_size * args.gpus,
-        drop_last=True)
-    batched_test_reader = paddle.batch(
-        test_reader, batch_size=args.batch_size, drop_last=True)
-
-    return avg_cost, inference_program, optimizer, batched_train_reader,\
-                   batched_test_reader, batch_acc
+        batched_reader = None
+        pyreader.decorate_paddle_reader(
+            paddle.batch(
+                reader if args.no_random else paddle.reader.shuffle(
+                    reader, buf_size=5120),
+                batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [batch_acc1,
+                                 batch_acc5], batched_reader, pyreader
diff --git a/benchmark/fluid/models/resnet_with_preprocess.py b/benchmark/fluid/models/resnet_with_preprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8d661d847516a15e4e28796960815935b82ae6f
--- /dev/null
+++ b/benchmark/fluid/models/resnet_with_preprocess.py
@@ -0,0 +1,268 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import numpy as np
+import time
+import os
+
+import cProfile, pstats, StringIO
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+# from recordio_converter import imagenet_train, imagenet_test
+from imagenet_reader import train_raw, val
+
+
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  act='relu',
+                  is_train=True):
+    conv1 = fluid.layers.conv2d(
+        input=input,
+        filter_size=filter_size,
+        num_filters=ch_out,
+        stride=stride,
+        padding=padding,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)
+
+
+def shortcut(input, ch_out, stride, is_train=True):
+    ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
+    if ch_in != ch_out:
+        return conv_bn_layer(
+            input, ch_out, 1, stride, 0, None, is_train=is_train)
+    else:
+        return input
+
+
+def basicblock(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
+    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def bottleneck(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out * 4, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
+    conv3 = conv_bn_layer(
+        conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
+    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):
+    res_out = block_func(input, ch_out, stride)
+    for i in range(1, count):
+        res_out = block_func(res_out, ch_out, 1)
+    return res_out
+
+
+def resnet_imagenet(input,
+                    class_dim,
+                    depth=50,
+                    data_format='NCHW',
+                    is_train=True):
+
+    cfg = {
+        18: ([2, 2, 2, 1], basicblock),
+        34: ([3, 4, 6, 3], basicblock),
+        50: ([3, 4, 6, 3], bottleneck),
+        101: ([3, 4, 23, 3], bottleneck),
+        152: ([3, 8, 36, 3], bottleneck)
+    }
+    stages, block_func = cfg[depth]
+    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+    pool1 = fluid.layers.pool2d(
+        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+    pool2 = fluid.layers.pool2d(
+        input=res4,
+        pool_size=7,
+        pool_type='avg',
+        pool_stride=1,
+        global_pooling=True)
+    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
+    return out
+
+
+def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
+    assert (depth - 2) % 6 == 0
+
+    n = (depth - 2) // 6
+
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
+    return out
+
+
+def _model_reader_dshape_classdim(args, is_train):
+    model = resnet_cifar10
+    reader = None
+    if args.data_set == "cifar10":
+        class_dim = 10
+        if args.data_format == 'NCHW':
+            dshape = [3, 32, 32]
+        else:
+            dshape = [32, 32, 3]
+        model = resnet_cifar10
+        if is_train:
+            reader = paddle.dataset.cifar.train10()
+        else:
+            reader = paddle.dataset.cifar.test10()
+    elif args.data_set == "flowers":
+        class_dim = 102
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+        model = resnet_imagenet
+        if is_train:
+            reader = paddle.dataset.flowers.train()
+        else:
+            reader = paddle.dataset.flowers.test()
+    elif args.data_set == "imagenet":
+        class_dim = 1000
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+        model = resnet_imagenet
+        if not args.data_path:
+            raise Exception(
+                "Must specify --data_path when training with imagenet")
+        if not args.use_reader_op:
+            if is_train:
+                reader = train_raw()
+            else:
+                reader = val()
+        else:
+            if is_train:
+                reader = train_raw()
+            else:
+                reader = val(xmap=False)
+    return model, reader, dshape, class_dim
+
+
+def get_model(args, is_train, main_prog, startup_prog):
+    model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
+                                                                     is_train)
+
+    pyreader = None
+    trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=args.batch_size * args.gpus,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('uint8', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='uint8')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            # add imagenet preprocessors
+            random_crop = fluid.layers.random_crop(input, dshape)
+            casted = fluid.layers.cast(random_crop, 'float32')
+            # input is HWC
+            trans = fluid.layers.transpose(casted, [0, 3, 1, 2]) / 255.0
+            img_mean = fluid.layers.tensor.assign(
+                np.array([0.485, 0.456, 0.406]).astype('float32').reshape((3, 1,
+                                                                           1)))
+            img_std = fluid.layers.tensor.assign(
+                np.array([0.229, 0.224, 0.225]).astype('float32').reshape((3, 1,
+                                                                           1)))
+            h1 = fluid.layers.elementwise_sub(trans, img_mean, axis=1)
+            h2 = fluid.layers.elementwise_div(h1, img_std, axis=1)
+
+            # pre_out = (trans - img_mean) / img_std
+
+            predict = model(h2, class_dim, is_train=is_train)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+
+            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
+            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
+
+            # configure optimize
+            optimizer = None
+            if is_train:
+                if args.use_lars:
+                    lars_decay = 1.0
+                else:
+                    lars_decay = 0.0
+
+                total_images = 1281167 / trainer_count
+
+                step = int(total_images / args.batch_size + 1)
+                epochs = [30, 60, 80, 90]
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = []
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=base_lr,
+                    #learning_rate=fluid.layers.piecewise_decay(
+                    #    boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+                optimizer.minimize(avg_cost)
+
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader if args.no_random else paddle.reader.shuffle(
+                reader, buf_size=5120),
+            batch_size=args.batch_size * args.gpus,
+            drop_last=True)
+    else:
+        batched_reader = None
+        pyreader.decorate_paddle_reader(
+            paddle.batch(
+                # reader if args.no_random else paddle.reader.shuffle(
+                #     reader, buf_size=5120),
+                reader,
+                batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [batch_acc1,
+                                 batch_acc5], batched_reader, pyreader
diff --git a/benchmark/fluid/models/se_resnext.py b/benchmark/fluid/models/se_resnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f887fb324dc86a30b708b9ef04068282a3e6c3e
--- /dev/null
+++ b/benchmark/fluid/models/se_resnext.py
@@ -0,0 +1,286 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import math
+import os
+from imagenet_reader import train, val
+
+__all__ = [
+    "SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d",
+    "SE_ResNeXt152_32x4d", "get_model"
+]
+
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "batch_size": 256,
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+
+
+class SE_ResNeXt():
+    def __init__(self, layers=50, is_train=True):
+        self.params = train_parameters
+        self.layers = layers
+        self.is_train = is_train
+
+    def net(self, input, class_dim=1000):
+        layers = self.layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+        if layers == 50:
+            cardinality = 32
+            reduction_ratio = 16
+            depth = [3, 4, 6, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+        elif layers == 101:
+            cardinality = 32
+            reduction_ratio = 16
+            depth = [3, 4, 23, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+        elif layers == 152:
+            cardinality = 64
+            reduction_ratio = 16
+            depth = [3, 8, 36, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=3,
+                stride=2,
+                act='relu')
+            conv = self.conv_bn_layer(
+                input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
+            conv = self.conv_bn_layer(
+                input=conv,
+                num_filters=128,
+                filter_size=3,
+                stride=1,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
+                pool_type='max')
+
+        for block in range(len(depth)):
+            for i in range(depth[block]):
+                conv = self.bottleneck_block(
+                    input=conv,
+                    num_filters=num_filters[block],
+                    stride=2 if i == 0 and block != 0 else 1,
+                    cardinality=cardinality,
+                    reduction_ratio=reduction_ratio)
+
+        pool = fluid.layers.pool2d(
+            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+        drop = fluid.layers.dropout(x=pool, dropout_prob=0.5)
+        stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
+        out = fluid.layers.fc(input=drop,
+                              size=class_dim,
+                              act='softmax',
+                              param_attr=fluid.param_attr.ParamAttr(
+                                  initializer=fluid.initializer.Uniform(-stdv,
+                                                                        stdv)))
+        return out
+
+    def shortcut(self, input, ch_out, stride):
+        ch_in = input.shape[1]
+        if ch_in != ch_out or stride != 1:
+            filter_size = 1
+            return self.conv_bn_layer(input, ch_out, filter_size, stride)
+        else:
+            return input
+
+    def bottleneck_block(self, input, num_filters, stride, cardinality,
+                         reduction_ratio):
+        conv0 = self.conv_bn_layer(
+            input=input, num_filters=num_filters, filter_size=1, act='relu')
+        conv1 = self.conv_bn_layer(
+            input=conv0,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            groups=cardinality,
+            act='relu')
+        conv2 = self.conv_bn_layer(
+            input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+        scale = self.squeeze_excitation(
+            input=conv2,
+            num_channels=num_filters * 2,
+            reduction_ratio=reduction_ratio)
+
+        short = self.shortcut(input, num_filters * 2, stride)
+
+        return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+    def conv_bn_layer(self,
+                      input,
+                      num_filters,
+                      filter_size,
+                      stride=1,
+                      groups=1,
+                      act=None):
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) / 2,
+            groups=groups,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(
+            input=conv, act=act, is_test=not self.is_train)
+
+    def squeeze_excitation(self, input, num_channels, reduction_ratio):
+        pool = fluid.layers.pool2d(
+            input=input, pool_size=0, pool_type='avg', global_pooling=True)
+        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+        squeeze = fluid.layers.fc(input=pool,
+                                  size=num_channels / reduction_ratio,
+                                  act='relu',
+                                  param_attr=fluid.param_attr.ParamAttr(
+                                      initializer=fluid.initializer.Uniform(
+                                          -stdv, stdv)))
+        stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
+        excitation = fluid.layers.fc(input=squeeze,
+                                     size=num_channels,
+                                     act='sigmoid',
+                                     param_attr=fluid.param_attr.ParamAttr(
+                                         initializer=fluid.initializer.Uniform(
+                                             -stdv, stdv)))
+        scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+        return scale
+
+
+def SE_ResNeXt50_32x4d():
+    model = SE_ResNeXt(layers=50)
+    return model
+
+
+def SE_ResNeXt101_32x4d():
+    model = SE_ResNeXt(layers=101)
+    return model
+
+
+def SE_ResNeXt152_32x4d():
+    model = SE_ResNeXt(layers=152)
+    return model
+
+
+def get_model(args, is_train, main_prog, startup_prog):
+    model = SE_ResNeXt(layers=50)
+    batched_reader = None
+    pyreader = None
+    trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+    dshape = train_parameters["input_size"]
+
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=10,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('float32', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            out = model.net(input=input)
+            cost = fluid.layers.cross_entropy(input=out, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+            acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+            optimizer = None
+            if is_train:
+                if args.use_lars:
+                    lars_decay = 1.0
+                else:
+                    lars_decay = 0.0
+
+                total_images = 1281167 / trainer_count
+
+                step = int(total_images / args.batch_size + 1)
+                epochs = [40, 80, 100]
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = []
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    # learning_rate=base_lr,
+                    learning_rate=fluid.layers.piecewise_decay(
+                        boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4),
+                    LARS_weight_decay=lars_decay)
+                optimizer.minimize(avg_cost)
+
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    if is_train:
+        reader = train()
+    else:
+        reader = val()
+
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader, batch_size=args.batch_size * args.gpus, drop_last=True)
+    else:
+        pyreader.decorate_paddle_reader(
+            paddle.batch(
+                reader, batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [acc_top1, acc_top5], batched_reader, pyreader
diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py
index 3231542a17ace99a17c9f9b9bdb3c2527637d9ef..f23bb59de9158b0481320cc409879b3b72cbd43e 100644
--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -26,7 +26,6 @@ import numpy
 import paddle
 import paddle.dataset.imdb as imdb
 import paddle.fluid as fluid
-import paddle.batch as batch
 import paddle.fluid.profiler as profiler
 
 word_dict = imdb.word_dict()
@@ -43,19 +42,7 @@ def crop_sentence(reader, crop_size):
     return __impl__
 
 
-def get_model(args):
-    if args.use_reader_op:
-        raise Exception(
-            "stacked_dynamic_lstm do not support reader op for now.")
-    lstm_size = 512
-    emb_dim = 512
-    crop_size = 1500
-
-    data = fluid.layers.data(
-        name="words", shape=[1], lod_level=1, dtype='int64')
-    sentence = fluid.layers.embedding(
-        input=data, size=[len(word_dict), emb_dim])
-
+def lstm_net(sentence, lstm_size):
     sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
 
     rnn = fluid.layers.DynamicRNN()
@@ -97,31 +84,47 @@ def get_model(args):
 
     last = fluid.layers.sequence_pool(rnn(), 'last')
     logit = fluid.layers.fc(input=last, size=2, act='softmax')
-    loss = fluid.layers.cross_entropy(
-        input=logit,
-        label=fluid.layers.data(
-            name='label', shape=[1], dtype='int64'))
-    loss = fluid.layers.mean(x=loss)
+    return logit
 
-    # add acc
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
-                shape=[1], dtype='int64'), total=batch_size_tensor)
 
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
-    adam = fluid.optimizer.Adam()
+def get_model(args, is_train, main_prog, startup_prog):
+    if args.use_reader_op:
+        raise Exception(
+            "stacked_dynamic_lstm do not support reader op for now.")
+    lstm_size = 512
+    emb_dim = 512
+    crop_size = 1500
 
-    train_reader = batch(
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            data = fluid.layers.data(
+                name="words", shape=[1], lod_level=1, dtype='int64')
+            sentence = fluid.layers.embedding(
+                input=data, size=[len(word_dict), emb_dim])
+            logit = lstm_net(sentence, lstm_size)
+            loss = fluid.layers.cross_entropy(
+                input=logit,
+                label=fluid.layers.data(
+                    name='label', shape=[1], dtype='int64'))
+            loss = fluid.layers.mean(x=loss)
+
+            # add acc
+            batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
+                        shape=[1], dtype='int64'), total=batch_size_tensor)
+
+            if is_train:
+                adam = fluid.optimizer.Adam()
+                adam.minimize(loss)
+
+    if is_train:
+        reader = crop_sentence(imdb.train(word_dict), crop_size)
+    else:
+        reader = crop_sentence(imdb.test(word_dict), crop_size)
+
+    batched_reader = paddle.batch(
         paddle.reader.shuffle(
-            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
+            reader, buf_size=25000),
         batch_size=args.batch_size * args.gpus)
-    test_reader = batch(
-        paddle.reader.shuffle(
-            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
-        batch_size=args.batch_size)
 
-    return loss, inference_program, adam, train_reader, test_reader, batch_acc
+    return loss, adam, [batch_acc], batched_reader, None
diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py
index 932601302d2f5d56b53e3462af886429034d8989..cf9708d500684465dc8ec1666bf269e7e1300f59 100644
--- a/benchmark/fluid/models/vgg.py
+++ b/benchmark/fluid/models/vgg.py
@@ -25,7 +25,7 @@ import functools
 import os
 
 
-def vgg16_bn_drop(input):
+def vgg16_bn_drop(input, is_train=True):
     def conv_block(input, num_filter, groups, dropouts):
         return fluid.nets.img_conv_group(
             input=input,
@@ -46,13 +46,13 @@ def vgg16_bn_drop(input):
 
     drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
     fc1 = fluid.layers.fc(input=drop, size=512, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train)
     drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
     fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
     return fc2
 
 
-def get_model(args):
+def get_model(args, is_train, main_prog, startup_prog):
     if args.data_set == "cifar10":
         classdim = 10
         if args.data_format == 'NCHW':
@@ -65,57 +65,56 @@ def get_model(args):
             data_shape = [3, 224, 224]
         else:
             data_shape = [224, 224, 3]
+    filelist = [
+        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+    ]
+    with fluid.program_guard(main_prog, startup_prog):
+        if args.use_reader_op:
+            data_file_handle = fluid.layers.open_files(
+                filenames=filelist,
+                shapes=[[-1] + data_shape, (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"],
+                thread_num=1,
+                pass_num=1)
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(
+                    data_file_handle, batch_size=args.batch_size))
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                images, label = fluid.layers.read_file(data_file)
+            else:
+                images = fluid.layers.data(
+                    name='data', shape=data_shape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+            # Train program
+            net = vgg16_bn_drop(images, is_train=is_train)
+            predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
 
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1] + data_shape, (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        images, label = fluid.layers.read_file(data_file)
-    else:
-        images = fluid.layers.data(
-            name='data', shape=data_shape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    # Train program
-    net = vgg16_bn_drop(images)
-    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
-    # Optimization
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+            # Evaluator
+            batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            batch_acc = fluid.layers.accuracy(
+                input=predict, label=label, total=batch_size_tensor)
+            # Optimization
+            if is_train:
+                optimizer = fluid.optimizer.Adam(
+                    learning_rate=args.learning_rate)
+                optimizer.minimize(avg_cost)
 
     # data reader
-    train_reader = paddle.batch(
+    if is_train:
+        reader = paddle.dataset.cifar.train10() \
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train()
+    else:
+        reader = paddle.dataset.cifar.test10() \
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test()
+
+    batched_reader = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
+            reader, buf_size=5120),
         batch_size=args.batch_size * args.gpus)
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10()
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-        batch_size=args.batch_size)
 
-    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
+    return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index b520c03a836a9e3f263ba050f151877ffe0d071d..03c73786a6c31868b1893bfcb319e43e37db1a3d 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
+if (NOT WIN32) # windows msvc2015 support c++11 natively. 
+# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake.
 list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
-list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
 list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+endif(NOT WIN32)
+
+list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
 # in cuda9, suppress cuda warning on eigen 
 list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
 
+if (NOT WIN32)
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
     list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
 elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
@@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
     # nvcc 9 does not support -Os. Use Release flags instead
     list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
 endif()
+else(NOT WIN32)
+if(CMAKE_BUILD_TYPE STREQUAL "Release")
+  list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
+else()
+  message(FATAL "Windows only support Release build now. Please set visual studio build type to Release, x64 build.")
+endif()
+endif(NOT WIN32)
 
 mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
 mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
index dc6730662f0b888f1981ac9c086320acc52d0a50..ed054ff41ae0ec5a4b31dd256e397129cba3e8f1 100644
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -16,16 +16,6 @@ set(ANAKIN_LIBRARY     ${ANAKIN_INSTALL_DIR})
 set(ANAKIN_SHARED_LIB  ${ANAKIN_LIBRARY}/libanakin.so)
 set(ANAKIN_SABER_LIB   ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
 
-# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
-set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
-set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
-set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
-set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
-execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
-
 include_directories(${ANAKIN_INCLUDE})
 include_directories(${ANAKIN_INCLUDE}/saber/)
 include_directories(${ANAKIN_INCLUDE}/saber/core/)
@@ -48,21 +38,24 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
     -Wno-reorder
     -Wno-error=cpp)
 
+if(WITH_GPU)
+    set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=YES -DCUDNN_ROOT=${CUDNN_ROOT} -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR})
+else()
+    set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=NO)
+endif()
 ExternalProject_Add(
     extern_anakin
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLML_PROJECT}
     GIT_REPOSITORY      "https://github.com/PaddlePaddle/Anakin"
-    GIT_TAG             "9424277cf9ae180a14aff09560d3cd60a49c76d2"
+    GIT_TAG             "3c8554f4978628183566ab7dd6c1e7e66493c7cd"
     PREFIX              ${ANAKIN_SOURCE_DIR}
     UPDATE_COMMAND      ""
-    CMAKE_ARGS          -DUSE_GPU_PLACE=YES
+    CMAKE_ARGS          ${CMAKE_ARGS_PREFIX}
                         -DUSE_X86_PLACE=YES
                         -DBUILD_WITH_UNIT_TEST=NO
                         -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
                         -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-                        -DCUDNN_ROOT=${CUDNN_ROOT}
-                        -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
                         -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
                         ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index 7fb67afbe15a5a019c978092d5ba3a4a0f66d996..fd9835d023c67b76579913f2ec56c2444fea8c15 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -44,7 +44,7 @@ ExternalProject_Add(
     # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
     #    checkout and clean other dirs under third_party
     # 4. remove .git, and package the directory.
-    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
+    URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
     URL_MD5  "1f268a2aff6759839dccd256adcc91cf"
     PREFIX          ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index bc36683a9facc253e7b9feb0c5a56e79491fb9b0..077072f6eadb0c48f4ae32f94828613d89ed01c9 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
 if (NOT WIN32)
-copy(framework_lib DEPS framework_py_proto 
-  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
-)
-else()
-copy(framework_lib
+set(framework_lib_deps framework_py_proto)
+endif(NOT WIN32)
+copy(framework_lib DEPS ${framework_lib_deps}
   SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+       ${src_dir}/${module}/ir/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
 )
-endif(NOT WIN32)
 
 set(module "memory")
 copy(memory_lib
@@ -148,12 +145,12 @@ copy(memory_lib
 set(inference_deps paddle_fluid_shared paddle_fluid)
 
 set(module "inference/api")
-if (WITH_ANAKIN AND WITH_GPU)
+if (WITH_ANAKIN AND WITH_MKL)
     copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
         SRCS
         ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
         ${ANAKIN_INSTALL_DIR} # anakin release
-        DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
+        DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
      list(APPEND inference_deps anakin_inference_lib)
 endif()
 
@@ -161,7 +158,8 @@ set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
   SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
        ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
+       ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )
 
 set(module "platform")
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index ecbd8191ccf5aa6046e7875fe8afa2ed0105e4a0..6f0267cd7a1d0afcdcb1596a46ffe2d15eea100d 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -822,6 +822,14 @@ pad
 ..  autofunction:: paddle.fluid.layers.pad
     :noindex:
 
+.. _api_fluid_layers_pad_constant_like:
+
+pad_constant_like
+---
+
+..  autofunction:: paddle.fluid.layers.pad_constant_like
+    :noindex:
+
 .. _api_fluid_layers_label_smooth:
 
 label_smooth
@@ -1145,6 +1153,14 @@ sigmoid
 ..  autofunction:: paddle.fluid.layers.sigmoid
     :noindex:
 
+.. _api_fluid_layers_hsigmoid:
+
+hsigmoid
+-------
+
+..  autofunction:: paddle.fluid.layers.hsigmoid
+    :noindex:
+
 .. _api_fluid_layers_logsigmoid:
 
 logsigmoid
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
index b782242a6632a5d42a512cf3b830d6e047c064ab..e4682ccb94e6fc60e184632dff9ee16a6bf16ec0 100644
--- a/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
@@ -1,5 +1,5 @@
-服务器端部署 - Anakin
-#####################
+Anakin - 服务器端加速引擎
+#######################
 
 
 使用文档
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst
deleted file mode 100644
index a5209e8560b31e9f0f776fba9a2b8c5bc150165c..0000000000000000000000000000000000000000
--- a/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-服务器端部署 - 原生引擎
-#######################
-
-..  toctree::
-    :maxdepth: 2
-
-    build_and_install_lib_cn.rst
-    native_infer.rst
diff --git a/doc/fluid/new_docs/advanced_usage/index.rst b/doc/fluid/new_docs/advanced_usage/index.rst
index dea7c236619a0bdbf402f371571d947d1cdbba65..89166573eebca045e948046c69f3b7a3e0031d58 100644
--- a/doc/fluid/new_docs/advanced_usage/index.rst
+++ b/doc/fluid/new_docs/advanced_usage/index.rst
@@ -10,7 +10,6 @@
 ..  toctree::
     :maxdepth: 2
 
-    deploy/index_native.rst
     deploy/index_anakin.rst
     deploy/index_mobile.rst
     development/contribute_to_paddle.md
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..dc7c62b06287ad333dd41082e566b0553d3a5341
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore
@@ -0,0 +1,8 @@
+*.pyc
+train.log
+output
+data/cifar-10-batches-py/
+data/cifar-10-python.tar.gz
+data/*.txt
+data/*.list
+data/mean.meta
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
index 8d645718e12e4d976a8e71de105e11f495191fbf..4f20843596aa676962a36241f59560ec2a41257b 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
@@ -21,7 +21,7 @@
 图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果，即模型可以正确识别图像上的主要物体。
 
 <p align="center">
-<img src="image/dog_cat.png "  width="350" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/dog_cat.png?raw=true"  width="350" ><br/>
 图1. 通用图像分类展示
 </p>
 
@@ -30,7 +30,7 @@
 
 
 <p align="center">
-<img src="image/flowers.png" width="400" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/flowers.png?raw=true" width="400" ><br/>
 图2. 细粒度图像分类展示
 </p>
 
@@ -38,7 +38,7 @@
 一个好的模型既要对不同类别识别正确，同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动，较好的模型会像聪明的人类一样能够正确识别。
 
 <p align="center">
-<img src="image/variations.png" width="550" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/variations.png?raw=true" width="550" ><br/>
 图3. 扰动图片展示[22]
 </p>
 
@@ -61,7 +61,7 @@
 Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破，效果大幅度超越传统方法，获得了ILSVRC2012冠军，该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后，涌现了一系列CNN模型，不断地在ImageNet上刷新成绩，如图4展示。随着模型变得越来越深以及精妙的结构设计，Top-5的错误率也越来越低，降到了3.5%附近。而在同样的ImageNet数据集上，人眼的辨识错误率大概在5.1%，也就是目前的深度学习模型的识别能力已经超过了人眼。
 
 <p align="center">
-<img src="image/ilsvrc.png" width="500" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/ilsvrc.png?raw=true" width="500" ><br/>
 图4. ILSVRC图像分类Top-5错误率
 </p>
 
@@ -70,7 +70,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
 传统CNN包含卷积层、全连接层等组件，并采用softmax多类别分类器和多类交叉熵损失函数，一个典型的卷积神经网络如图5所示，我们先介绍用来构造CNN的常见组件。
 
 <p align="center">
-<img src="image/lenet.png"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/lenet.png?raw=true"><br/>
 图5. CNN网络示例[20]
 </p>
 
@@ -89,7 +89,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
 牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构，它的核心是五组卷积操作，每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积，卷积核的数目由较浅组的64增多到最深组的512，同一组内的卷积核数目是一样的。卷积之后接两层全连接层，之后是分类层。由于每组内卷积层的不同，有11、13、16、19层这几种模型，下图展示一个16层的网络结构。VGG模型结构相对简洁，提出之后也有很多文章基于此模型进行研究，如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。
 
 <p align="center">
-<img src="image/vgg16.png" width="750" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/vgg16.png?raw=true" width="750" ><br/>
 图6. 基于ImageNet的VGG16模型
 </p>
 
@@ -106,7 +106,7 @@ NIN模型主要有两个特点：
 Inception模块如下图7所示，图(a)是最简单的设计，输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数，拼接后会导致特征的通道数较大，经过几层这样的模块堆积后，通道数会越来越大，导致参数和计算量也随之增大。为了改善这个缺点，图(b)引入3个1x1卷积层进行降维，所谓的降维就是减少通道数，同时如NIN模型中提到的1x1卷积也可以修正线性特征。
 
 <p align="center">
-<img src="image/inception.png" width="800" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/inception.png?raw=ture" width="800" ><br/>
 图7. Inception模块
 </p>
 
@@ -115,7 +115,7 @@ GoogleNet由多组Inception模块堆积而成。另外，在网络最后也没
 GoogleNet整体网络结构如图8所示，总共22层网络：开始由3层普通的卷积组成；接下来由三组子网络组成，第一组子网络包含2个Inception模块，第二组包含5个Inception模块，第三组包含2个Inception模块；然后接均值池化层、全连接层。
 
 <p align="center">
-<img src="image/googlenet.jpeg" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/googlenet.jpeg?raw=true" ><br/>
 图8. GoogleNet[12]
 </p>
 
@@ -130,14 +130,14 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
 残差模块如图9所示，左边是基本模块连接方式，由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式，之所以称为瓶颈，是因为上面的1x1卷积用来降维(图示例即256->64)，下面的1x1卷积用来升维(图示例即64->256)，这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。
 
 <p align="center">
-<img src="image/resnet_block.jpg" width="400"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/resnet_block.jpg?raw=true" width="400"><br/>
 图9. 残差模块
 </p>
 
 图10展示了50、101、152层网络连接示意图，使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快，成功的训练了上百乃至近千层的卷积神经网络。
 
 <p align="center">
-<img src="image/resnet.png"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/resnet.png?raw=true"><br/>
 图10. 基于ImageNet的ResNet模型
 </p>
 
@@ -149,7 +149,7 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
 由于ImageNet数据集较大，下载和训练较慢，为了方便大家学习，我们使用[CIFAR10](<https://www.cs.toronto.edu/~kriz/cifar.html>)数据集。CIFAR10数据集包含60,000张32x32的彩色图片，10个类别，每个类包含6,000张。其中50,000张图片作为训练集，10000张作为测试集。图11从每个类别中随机抽取了10张图片，展示了所有的类别。
 
 <p align="center">
-<img src="image/cifar.png" width="350"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/cifar.png?raw=true" width="350"><br/>
 图11. CIFAR10数据集[21]
 </p>
 
@@ -377,7 +377,7 @@ test_reader = paddle.batch(
 `event_handler_plot`可以用来利用回调数据来打点画图:
 
 <p align="center">
-<img src="image/train_and_test.png" width="350"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/train_and_test.png?raw=true" width="350"><br/>
 图12. 训练结果
 </p>
 
@@ -469,7 +469,7 @@ Test with Pass 0, Loss 1.1, Acc 0.6
 图13是训练的分类错误率曲线图，运行到第200个pass后基本收敛，最终得到测试集上分类错误率为8.54%。
 
 <p align="center">
-<img src="image/plot.png" width="400" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/plot.png?raw=true" width="400" ><br/>
 图13. CIFAR10数据集上VGG模型的分类错误率
 </p>
 
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/cifar.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/cifar.png
deleted file mode 100644
index f3c5f2f7b0c84f83382b70124dcd439586ed4eb0..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/cifar.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png
deleted file mode 100644
index ca8f858a902ea723d886d2b88c2c0a1005301c50..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png
deleted file mode 100644
index 38b21f21604b1bb84fc3f6aa96bd5fce45d15a55..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png
deleted file mode 100644
index 647c822e52cd55d50e5f207978f5e6ada86cf34c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png
deleted file mode 100644
index 04245cef60fe7126ae4c92ba8085273965078bee..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg
deleted file mode 100644
index 249dbf96df61c3352ea5bd80470f6c4a1e03ff10..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png
deleted file mode 100644
index 4660ac122e9d533023a21154d35eee29e3b08d27..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png
deleted file mode 100644
index 9591a0c1e8c0165c40ca560be35a7b9a91cd5027..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception_en.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception_en.png
deleted file mode 100644
index 39580c20b583f2a15d17fd124a572c84e6e2db1d..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png
deleted file mode 100644
index 77f785e03bacd38c4c64a817874a58ff3298d2f3..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet_en.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet_en.png
deleted file mode 100644
index 97a1e3eee45c0db95e6a943ca3b8c0cf6c34d4b6..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png
deleted file mode 100644
index 57e45cc0c27dd99b9918de2ff1228bc6b65f7424..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot_en.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot_en.png
deleted file mode 100644
index 147e575bf49086811c43420d5a9c8f749e2da405..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png
deleted file mode 100644
index 0aeb4f254639fdbf18e916dc219ca61602596d85..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg
deleted file mode 100644
index c500eb01a90190ff66150871fe83ec275e2de8d7..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png
deleted file mode 100644
index c6336a9a69b95dc978719ce68896e3e752e67fed..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations.png
deleted file mode 100644
index b4ebbbe6a50f5fd7cd0cccb52cdac5653e34654c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations_en.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations_en.png
deleted file mode 100644
index 88c60fe87f802c5ce560bb15bbdbd229aeafc4e4..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png
deleted file mode 100644
index 6270eefcfd7071bc1643ee06567e5b81aaf4c177..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/index.rst b/doc/fluid/new_docs/beginners_guide/basics/index.rst
index e1fd226116d88fbf137741242b304b367e598ba5..0fcb008e0a7773e81e5124da09fe07366130b924 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/index.rst
+++ b/doc/fluid/new_docs/beginners_guide/basics/index.rst
@@ -6,7 +6,7 @@
 ..  todo::
 
     概述
-
+    
 ..  toctree::
     :maxdepth: 2
 
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..29b5622a53a1b0847e9f53febf1cc50dcf4f044a
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore
@@ -0,0 +1,12 @@
+data/train.list
+data/test.*
+data/conll05st-release.tar.gz
+data/conll05st-release
+data/predicate_dict
+data/label_dict
+data/word_dict
+data/emb
+data/feature
+output
+predict.res
+train.log
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
index 47e948bd1ffc0ca692dc9899193e94831ce4234b..0891f5b6b16a1b715b44db6c47ba079adfcad4c5 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
@@ -21,7 +21,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
 5. 对第4步的结果，通过多分类得到论元的语义角色标签。可以看到，句法分析是基础，并且后续步骤常常会构造的一些人工特征，这些特征往往也来自句法分析。
 
 <div  align="center">
-<img src="image/dependency_parsing.png" width = "80%" align=center /><br>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/dependency_parsing.png?raw=true" width = "80%" align=center /><br>
 图1. 依存句法分析句法树示例
 </div>
 
@@ -30,7 +30,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
 我们继续以上面的这句话为例，图1展示了BIO表示方法。
 
 <div  align="center">
-<img src="image/bio_example.png" width = "90%"  align=center /><br>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/bio_example.png?raw=true" width = "90%"  align=center /><br>
 图2. BIO标注方法示例
 </div>
 
@@ -53,7 +53,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
 图3是最终得到的栈式循环神经网络结构示意图。
 
 <p align="center">  
-<img src="./image/stacked_lstm.png" width = "40%"  align=center><br>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/stacked_lstm.png?raw=true" width = "40%"  align=center><br>
 图3. 基于LSTM的栈式循环神经网络结构示意图
 </p>
 
@@ -64,7 +64,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
 为了克服这一缺陷，我们可以设计一种双向循环网络单元，它的思想简单且直接：对上一节的栈式循环神经网络进行一个小小的修改，堆叠多个LSTM单元，让每一层LSTM单元分别以：正向、反向、正向 …… 的顺序学习上一层的输出序列。于是，从第2层开始，$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。
 
 <p align="center">  
-<img src="./image/bidirectional_stacked_lstm.png" width = "60%" align=center><br>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/bidirectional_stacked_lstm.png?raw=true" width = "60%" align=center><br>
 图4. 基于LSTM的双向循环神经网络结构示意图
 </p>
 
@@ -79,7 +79,7 @@ CRF是一种概率化结构模型，可以看作是一个概率无向图模型
 序列标注任务只需要考虑输入和输出都是一个线性序列，并且由于我们只是将输入序列作为条件，不做任何条件独立假设，因此输入序列的元素之间并不存在图结构。综上，在序列标注任务中使用的是如图5所示的定义在链式图上的CRF，称之为线性链条件随机场（Linear Chain Conditional Random Field）。
 
 <p align="center">  
-<img src="./image/linear_chain_crf.png" width = "35%" align=center><br>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/linear_chain_crf.png?raw=true" width = "35%" align=center><br>
 图5. 序列标注任务中使用的线性链条件随机场
 </p>
 
@@ -123,7 +123,7 @@ $$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\pr
 4. CRF以第3步中LSTM学习到的特征为输入，以标记序列为监督信号，完成序列标注；
 
 <div  align="center">  
-<img src="image/db_lstm_network.png" width = "60%"  align=center /><br>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/db_lstm_network.png?raw=true" width = "60%"  align=center /><br>
 图6. SRL任务上的深层双向LSTM模型
 </div>
 
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png
deleted file mode 100644
index e63f5ebd6d00f2e4ecf97b9ab2027e74683013f2..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png
deleted file mode 100644
index f0a195c24d9ee493f96bb93c28a99e70566be7a4..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png
deleted file mode 100644
index e5f7151c9fcc50a7cf7af485cbbc7e4fccab0c20..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png
deleted file mode 100644
index 93b44dd4874402ef29ad7bd7d94147609b92e309..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png
deleted file mode 100644
index 592f7ee23bdc88a9a35059612e5ab880bbc9d34b..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png
deleted file mode 100644
index c3646312e48db977402fb353dc0c9b4d02269bf4..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png
deleted file mode 100644
index 9265b671735940ed6549e2980064d2ce08baae64..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png
deleted file mode 100644
index 23f4f45b603e3d60702af2b2464d10fc8deed061..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png
deleted file mode 100644
index 0778fda74b2ad22ce4b631791a7b028cdef780a5..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png
deleted file mode 100644
index 3d2914c726b5f4c46e66dfa85d4e88649fede6b3..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png
deleted file mode 100644
index 0b944ef91e8b5ba4b14d2a35bd8879f261cf8f61..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..6129b9e8645010fcb8372d9dc3dbb568dfa80907
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore
@@ -0,0 +1,9 @@
+data/wmt14
+data/pre-wmt14
+pretrained/wmt14_model
+gen.log
+gen_result
+train.log
+dataprovider_copy_1.py
+*.pyc
+multi-bleu.perl
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
index f37c559921483a3d7c619ed74903df56b0584bd5..6e5f77fec8a894c390ced8c93ee344fd8d27370e 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
@@ -11,10 +11,10 @@
 为解决以上问题，统计机器翻译（Statistical Machine Translation, SMT）技术应运而生。在统计机器翻译技术中，转化规则是由机器自动从大规模的语料中学习得到的，而非我们人主动提供规则。因此，它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题，但仍然存在许多挑战：1）人为设计许多特征（feature），但永远无法覆盖所有的语言现象；2）难以利用全局的特征；3）依赖于许多预处理环节，如词语对齐、分词或符号化（tokenization）、规则抽取、句法分析等，而每个环节的错误会逐步累积，对翻译的影响也越来越大。
 
 近年来，深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类：1）仍以统计机器翻译系统为框架，只是利用神经网络来改进其中的关键模块，如语言模型、调序模型等（见图1的左半部分）；2）不再以统计机器翻译系统为框架，而是直接用神经网络将源语言映射到目标语言，即端到端的神经网络机器翻译（End-to-End Neural Machine Translation, End-to-End NMT）（见图1的右半部分），简称为NMT模型。
-![nmt](./image/nmt.png)
-<p align="center">
+<div align="center">
+<img src="https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/nmt.png?raw=true" width = "400" align=center/><br/>
 图1. 基于神经网络的机器翻译系统
-</p>
+</div>
 
 本教程主要介绍NMT模型，以及如何用PaddlePaddle来训练一个NMT模型。
 
@@ -45,19 +45,22 @@
 
 具体来说，该双向循环神经网络分别在时间维以顺序和逆序——即前向（forward）和后向（backward）——依次处理输入序列，并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点，都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN，其中有六个权重矩阵：输入到前向隐层和后向隐层的权重矩阵（`$W_1, W_3$`），隐层到隐层自己的权重矩阵（`$W_2,W_5$`），前向隐层和后向隐层到输出层的权重矩阵（`$W_4, W_6$`）。注意，该网络的前向隐层和后向隐层之间没有连接。
 
-![bi_rnn](./image/bi_rnn.png)
-<p align="center">
-图3. 按时间步展开的双向循环神经网络
-</p>
+
+<div align="center">
+<img src = "https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/bi_rnn.png?raw=true" width="400"><br/>
+图2. 按时间步展开的双向循环神经网络
+</div>
 
 ### 编码器-解码器框架
 
 编码器-解码器（Encoder-Decoder）\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量，解码阶段通过最大化预测序列概率，从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。
 ![encoder_decoder](./image/encoder_decoder.png)
-<p align="center">
-图4. 编码器-解码器框架
-</p>
+<div align="center">
+<img src ="https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/encoder_decoder.png?raw=true" width="400"><br/>
+图3. 编码器-解码器框架
+</div>
 
+<a name="编码器"></a>
 #### 编码器
 
 编码阶段分为三步：
@@ -69,19 +72,17 @@
 3. 用RNN编码源语言词序列：这一过程的计算公式为`$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$`，其中`$h_0$`是一个全零的向量，`$\varnothing _\theta$`是一个非线性激活函数，最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码，或使用时间维上的池化（pooling）结果。
 
 第3步也可以使用双向循环神经网络实现更复杂的句编码表示，具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词，并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的，后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词，得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`，通过拼接两个GRU的结果得到它的隐层状态，即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$`。
-
-![encoder_attention](./image/encoder_attention.png)
-<p align="center">
-图5. 使用双向GRU的编码器
-</p>
+<div align="center">
+<img src="https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/encoder_attention.png?raw=true" width="400"><br/>
+图4. 使用双向GRU的编码器
+</div>
 
 #### 解码器
 
 机器翻译任务的训练过程中，解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是：
-
 1. 每一个时刻，根据源语言句子的编码信息（又叫上下文向量，context vector）`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`，计算出下一个隐层状态`$z_{i+1}$`。计算公式如下：
 $$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$
-其中`$\phi _{\theta '}$`是一个非线性激活函数；`$c=q\mathbf{h}$`是源语言句子的上下文向量，在不使用[注意力机制](#注意力机制)时，如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素，则可以定义`$c=h_T$`；`$u_i$`是目标语言序列的第`$i$`个单词，`$u_0$`是目标语言序列的开始标记`<s>`，表示解码开始；`$z_i$`是`$i$`时刻解码RNN的隐层状态，`$z_0$`是一个全零的向量。
+其中`$\phi _{\theta '}$`是一个非线性激活函数；`$c=q\mathbf{h}$`是源语言句子的上下文向量，在不使用注意力机制时，如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素，则可以定义`$c=h_T$`；`$u_i$`是目标语言序列的第`$i$`个单词，`$u_0$`是目标语言序列的开始标记`<s>`，表示解码开始；`$z_i$`是`$i$`时刻解码RNN的隐层状态，`$z_0$`是一个全零的向量。
 
 2. 将`$z_{i+1}$`通过`softmax`归一化，得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下：
 $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
@@ -93,6 +94,7 @@ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
 
 机器翻译任务的生成过程，通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异，具体介绍请见[柱搜索算法](#柱搜索算法)。
 
+<a name="柱搜索算法"></a>
 ### 柱搜索算法
 
 柱搜索（[beam search](http://en.wikipedia.org/wiki/Beam_search)）是一种启发式图搜索算法，用于在图或树中搜索有限集合中的最优扩展节点，通常用在解空间非常大的系统（如机器翻译、语音识别）中，原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`<s>你好<e>`”，就算目标语言字典中只有3个词（`<s>`, `<e>`, `hello`），也可能生成无限句话（`hello`循环出现的次数不定），为了找到其中较好的翻译结果，我们可采用柱搜索算法。
@@ -100,7 +102,6 @@ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
 柱搜索算法使用广度优先策略建立搜索树，在树的每一层，按照启发代价（heuristic cost）（本教程中，为生成词的log概率之和）对节点进行排序，然后仅留下预先确定的个数（文献中通常称为beam width、beam size、柱宽度等）的节点。只有这些节点会在下一层继续扩展，其他节点就被剪掉了，也就是说保留了质量较高的节点，剪枝了质量较差的节点。因此，搜索所占用的空间和时间大幅减少，但缺点是无法保证一定获得最优解。
 
 使用柱搜索算法的解码阶段，目标是最大化生成序列的概率。思路是：
-
 1. 每一个时刻，根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`，计算出下一个隐层状态`$z_{i+1}$`。
 
 2. 将`$z_{i+1}$`通过`softmax`归一化，得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png
deleted file mode 100644
index 9d8efd50a49d0305586f550344472ab94c93bed3..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png
deleted file mode 100644
index 4b35c88fc8ea2c503473c0c15711744e784d6af6..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png
deleted file mode 100644
index 1b355e7786d25487a3f564af758c2c52c43b4690..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png
deleted file mode 100644
index 3728f782ee09d9308d02b42305027b2735467ead..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png
deleted file mode 100644
index 28d7a15a3bd65262bde22a3f41b5aa78b46b368a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png
deleted file mode 100644
index ea8585565da1ecaf241654c278c6f9b15e283286..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png
deleted file mode 100644
index 60aee0017de73f462e35708b1055aff8992c03e1..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png
deleted file mode 100644
index 6b73798fe632e0873b35c117b86f347c8cf3116a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png
deleted file mode 100644
index 0cde685b84106650a4df18ce335a23e6338d3d11..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png
deleted file mode 100644
index a6af429f23f0f7e82650139bbd8dcbef27a34abe..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png
deleted file mode 100644
index bf56d73ebf297fadf522389c7b6836dd379aa097..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png
deleted file mode 100644
index 557310e044b2b6687e5ea6895417ed946ac7bc11..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..f23901aeb3a9e7cd12611fc556742670d04a9bb5
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore
@@ -0,0 +1,2 @@
+.idea
+.ipynb_checkpoints
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
index 0f7c97021f8ad463fc51ed169604b789ea068c3d..4b79e62f74e587fcd939d9f9e911af80992ea6a3 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
@@ -37,7 +37,7 @@ Prediction Score is 4.25
 YouTube是世界上最大的视频上传、分享和发现网站，YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成：候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选，排序网络对候选进行打分排序，输出排名最高的数十个结果。系统结构如图1所示：
 
 <p align="center">
-<img src="image/YouTube_Overview.png" width="70%" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/YouTube_Overview.png?raw=true" width="70%" ><br/>
 图1. YouTube 推荐系统结构
 </p>
 
@@ -48,7 +48,7 @@ YouTube是世界上最大的视频上传、分享和发现网站，YouTube推荐
 首先，将观看历史及搜索词记录这类历史信息，映射为向量后取平均值得到定长表示；同时，输入人口学特征以优化新用户的推荐效果，并将二值特征和连续特征归一化处理到[0, 1]范围。接下来，将所有特征表示拼接为一个向量，并输入给非线形多层感知器（MLP，详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程）处理。最后，训练时将MLP的输出给softmax做分类，预测时计算用户的综合特征（MLP的输出）与所有视频的相似度，取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。
 
 <p align="center">
-<img src="image/Deep_candidate_generation_model_architecture.png" width="70%" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/Deep_candidate_generation_model_architecture.png?raw=true" width="70%" ><br/>
 图2. 候选生成网络结构
 </p>
 
@@ -73,7 +73,7 @@ $$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$
 卷积神经网络主要由卷积（convolution）和池化（pooling）操作构成，其应用及组合方式灵活多变，种类繁多。本小结我们以如图3所示的网络进行讲解：
 
 <p align="center">
-<img src="image/text_cnn.png" width = "80%" align="center"/><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/text_cnn.png?raw=true" width = "80%" align="center"/><br/>
 图3. 卷积神经网络文本分类模型
 </p>
 
@@ -107,7 +107,7 @@ $$\hat c=max(c)$$
 
 <p align="center">
 
-<img src="image/rec_regression_network.png" width="90%" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/rec_regression_network.png?raw=true" width="90%" ><br/>
 图4. 融合推荐模型
 </p>
 
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png
deleted file mode 100644
index c213608e769f69fb2cfe8597f8e696ee53730e3d..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png
deleted file mode 100644
index 8aedb2204371e7691140ceffa5992f6080bbf097..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png
deleted file mode 100644
index 4298567ac5600173343299999965b20612e7affe..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png
deleted file mode 100644
index a98e7cc67606b31e4c945f7eb907563e46dcef56..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png
deleted file mode 100644
index 7fd97b9cc3a0b9105b41591af4e8f8e4646bd681..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png
deleted file mode 100644
index 90c9b09fb78db98391ee199934f2d16efd6d6652..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png
deleted file mode 100644
index 6fc8e11967000ec48c1c0a6fa3c2eaecb80cbb84..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png
deleted file mode 100644
index 61e63d9147cbc2901706ef80776d706e5368c3c5..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png
deleted file mode 100644
index fbcae2be81141be955076e877b94b0ea5d7e4d4a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..667762d327cb160376a4119fa9df9db41b6443b2
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore
@@ -0,0 +1,10 @@
+data/aclImdb
+data/imdb
+data/pre-imdb
+data/mosesdecoder-master
+*.log
+model_output
+dataprovider_copy_1.py
+model.list
+*.pyc
+.DS_Store
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
index 5844b6fe137c2401a04e47b5b489434ee9b363f1..8477cf32146c33947ced447c8bdd287a3e1e71f5 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
@@ -37,7 +37,7 @@
 循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上，循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据（词序列），近年来，循环神经网络及其变体（如long short term memory\[[5](#参考文献)\]等）在自然语言处理的多个领域，如语言模型、句法解析、语义角色标注（或一般的序列标注）、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。
 
 <p align="center">
-<img src="image/rnn.png" width = "60%" align="center"/><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/image/rnn.png?raw=true" width = "60%" align="center"/><br/>
 图1. 循环神经网络按时间展开的示意图
 </p>
 
@@ -66,7 +66,7 @@ $$ h_t = o_t\odot tanh(c_t) $$
 其中，$i_t, f_t, c_t, o_t$分别表示输入门，遗忘门，记忆单元及输出门的向量值，带角标的$W$及$b$为模型参数，$tanh$为双曲正切函数，$\odot$表示逐元素（elementwise）的乘法操作。输入门控制着新输入进入记忆单元$c$的强度，遗忘门控制着记忆单元维持上一时刻值的强度，输出门控制着输出记忆单元的强度。三种门的计算方式类似，但有着完全不同的参数，它们各自以不同的方式控制着记忆单元$c$，如图2所示：
 
 <p align="center">
-<img src="image/lstm.png" width = "65%" align="center"/><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/image/lstm.png?raw=true" width = "65%" align="center"/><br/>
 图2. 时刻$t$的LSTM [7]
 </p>
 
@@ -83,7 +83,7 @@ $$ h_t=Recrurent(x_t,h_{t-1})$$
 如图3所示（以三层为例），奇数层LSTM正向，偶数层LSTM反向，高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入，对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示（这一表示充分融合了文本的上下文信息，并且对文本进行了深层次抽象），最后我们将文本表示连接至softmax构建分类模型。
 
 <p align="center">
-<img src="image/stacked_lstm.jpg" width=450><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/image/stacked_lstm.jpg?raw=true" width=450><br/>
 图3. 栈式双向LSTM用于文本分类
 </p>
 
@@ -149,6 +149,8 @@ def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
 
 网络的输入`input_dim`表示的是词典的大小，`class_dim`表示类别数。这里，我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。
 
+<a name="栈值双向LSTM"></a>
+
 ### 栈式双向LSTM
 
 栈式双向神经网络`stacked_lstm_net`的代码片段如下：
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png
deleted file mode 100644
index 98fbea413a98a619004ca669c67f5f867fe974c9..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png
deleted file mode 100644
index d73a00bf2c1fca2f9b8c26bccf5ea844fa1db50b..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png
deleted file mode 100644
index 26c904102a6e6c4e30f0048b81373ae8c148b355..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg
deleted file mode 100644
index 6b2adf70f2b5112a2e82505da5cff9f5fd0c6298..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png
deleted file mode 100644
index 8b5dbd726178b5555c513294e7b10a81acc96ff5..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a620e0279c310d213d4e6d8e99e666962c11e352
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore
@@ -0,0 +1,3 @@
+data/train.list
+data/test.list
+data/simple-examples*
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
index d21c7ddcc501f863b5ce672123dbbc6c26528f15..904d99fe2ffc9ead69a86c9763568a5c098348d5 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
@@ -34,7 +34,7 @@ $$X = USV^T$$
 本章中，当词向量训练好后，我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影（如下图所示）。从图中可以看出，语义相关的词语（如a, the, these; big, huge）在投影上距离很近，语意无关的词（如say, business; decision, japan）在投影上的距离很远。
 
 <p align="center">
-    <img src = "image/2d_similarity.png" width=400><br/>
+    <img src = "https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/2d_similarity.png?raw=true" width=400><br/>
     图1. 词向量的二维投影
 </p>
 
@@ -50,7 +50,7 @@ similarity: -0.0997506977351
 
 ```
 
-以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到，我们将在[应用模型](#应用模型)中详细描述用法。
+以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到，我们将在[模型应用](#模型应用)中详细描述用法。
 
 
 ## 模型概览
@@ -90,7 +90,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
 其中$f(w_t, w_{t-1}, ..., w_{t-n+1})$表示根据历史n-1个词得到当前词$w_t$的条件概率，$R(\theta)$表示参数正则项。
 
 <p align="center">
-       <img src="image/nnlm.png" width=500><br/>
+       <img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/nnlm.png?raw=true" width=500><br/>
        图2. N-gram神经网络模型
 </p>
 
@@ -122,7 +122,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
 CBOW模型通过一个词的上下文（各N个词）预测当前词。当N=2时，模型如下图所示：
 
 <p align="center">
-    <img src="image/cbow.png" width=250><br/>
+    <img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/cbow.png?raw=true" width=250><br/>
     图3. CBOW模型
 </p>
 
@@ -137,7 +137,7 @@ $$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$
 CBOW的好处是对上下文词语的分布在词向量上进行了平滑，去掉了噪声，因此在小数据集上很有效。而Skip-gram的方法中，用一个词预测其上下文，得到了当前词上下文的很多样本，因此可用于更大的数据集。
 
 <p align="center">
-    <img src="image/skipgram.png" width=250><br/>
+    <img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/skipgram.png?raw=true" width=250><br/>
     图4. Skip-gram模型
 </p>
 
@@ -189,12 +189,13 @@ dream that one day <e>
 
 最后，每个输入会按其单词次在字典里的位置，转化成整数的索引序列，作为PaddlePaddle的输入。
 
+<a name="训练模型"></a>
 ## 编程实现
 
 本配置的模型结构如下图所示：
 
 <p align="center">
-    <img src="image/ngram.png" width=400><br/>
+    <img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/ngram.png?raw=true" width=400><br/>
     图5. 模型配置中的N-gram神经网络模型
 </p>
 
@@ -349,6 +350,7 @@ Step 20: Average Cost 5.766995
 ...
 ```
 
+<a name="模型应用"></a>
 ## 模型应用
 在模型训练后，我们可以用它做一些预测。
 
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png
deleted file mode 100644
index 384f59919a2c8dedb198e97d51434616648932e1..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png
deleted file mode 100644
index 76b7d4bc0f99372465bd9aa34721513d39ad0776..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png
deleted file mode 100644
index d985c393e618e9b79df05e4ff0ae57ccc93744d0..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png
deleted file mode 100644
index 2e16ab2f443732b8ef5404a8e7cd2457bc5eee23..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png
deleted file mode 100644
index 2449dce6a86b43b1b997ff418ed0dba56848463f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png
deleted file mode 100644
index 1e0b40a8f7aefdf46d42761305511f281c08e595..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png
deleted file mode 100644
index 158bd64b8f8729dea67834a8d591d21bce8b8564..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png
deleted file mode 100644
index ce4a8bf4769183cbaff91793753d2350a3ce936c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png
deleted file mode 100644
index a3ab385845d3dc8b5c670bae91225bc8dd47a8bb..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png
deleted file mode 100644
index 3c36c6d1f66eb98ea78c0673965d02a4ee3aa288..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
index 27d25b43961ce74d73e391b735369501fb80a231..9574dbea2f9a39bb196b61bb4fd12ba7c378f75a 100644
--- a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
@@ -15,7 +15,7 @@ $$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b,  i=1,\ldo
 ## 效果展示
 我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中，每个点的横坐标表示同一类房屋真实价格的中位数，纵坐标表示线性回归模型根据特征预测的结果，当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确，则点离虚线越近。
 <p align="center">
-    <img src = "image/predictions.png" width=400><br/>
+    <img src = "https://github.com/PaddlePaddle/book/blob/develop/01.fit_a_line/image/predictions.png?raw=true" width=400><br/>
     图1. 预测值 V.S. 真实值
 </p>
 
@@ -40,13 +40,9 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
 ### 训练过程
 
 定义好模型结构之后，我们要通过以下几个步骤进行模型训练
-
  1. 初始化参数，其中包括权重$\omega_i$和偏置$b$，对其进行初始化（如0均值，1方差）。
-
  2. 网络正向传播计算网络输出和损失函数。
-
  3. 根据损失函数进行反向误差传播 （[backpropagation](https://en.wikipedia.org/wiki/Backpropagation)），将网络误差从输出层依次向前传递, 并更新网络中的参数。
-
  4. 重复2~3步骤，直至网络训练误差达到规定的程度或训练轮次达到设定值。
 
 ## 数据集
@@ -84,7 +80,7 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
 - 很多的机器学习技巧/模型（例如L1，L2正则项，向量空间模型-Vector Space Model）都基于这样的假设：所有的属性取值都差不多是以0为均值且取值范围相近的。
 
 <p align="center">
-    <img src = "image/ranges.png" width=550><br/>
+    <img src = "https://github.com/PaddlePaddle/book/blob/develop/01.fit_a_line/image/ranges.png?raw=true" width=550><br/>
     图2. 各维属性的取值范围
 </p>
 
@@ -199,10 +195,12 @@ step = 0
 def event_handler_plot(event):
     global step
     if isinstance(event, fluid.EndStepEvent):
-        if event.step % 10 == 0: # record the test cost every 10 seconds
+        if step % 10 == 0:   # record a train cost every 10 batches
+            plot_cost.append(train_title, step, event.metrics[0])
+
+        if step % 100 == 0:  # record a test cost every 100 batches
             test_metrics = trainer.test(
                 reader=test_reader, feed_order=feed_order)
-
             plot_cost.append(test_title, step, test_metrics[0])
             plot_cost.plot()
 
@@ -210,12 +208,13 @@ def event_handler_plot(event):
                 # If the accuracy is good enough, we can stop the training.
                 print('loss is less than 10.0, stop')
                 trainer.stop()
-
-        # We can save the trained parameters for the inferences later
-        if params_dirname is not None:
-            trainer.save_params(params_dirname)
-
         step += 1
+
+    if isinstance(event, fluid.EndEpochEvent):
+        if event.epoch % 10 == 0:
+            # We can save the trained parameters for the inferences later
+            if params_dirname is not None:
+                trainer.save_params(params_dirname)
 ```
 
 ### 开始训练
@@ -231,11 +230,10 @@ trainer.train(
     event_handler=event_handler_plot,
     feed_order=feed_order)
 ```
-
-<p align="center">
-    <img src = "image/train_and_test1.png" width=400><br/>
-    图3. 训练结果
-</p>
+<div align="center">
+<img src="https://github.com/PaddlePaddle/book/blob/develop/01.fit_a_line/image/train_and_test.png?raw=true" width="400"><br/>
+图3 训练结果
+</div>
 
 
 ## 预测
@@ -262,18 +260,18 @@ inferencer = fluid.Inferencer(
 batch_size = 10
 test_reader = paddle.batch(paddle.dataset.uci_housing.test(),batch_size=batch_size)
 test_data = test_reader().next()
-test_feat = numpy.array([data[0] for data in test_data]).astype("float32")
-test_label = numpy.array([data[1] for data in test_data]).astype("float32")
+test_x = numpy.array([data[0] for data in test_data]).astype("float32")
+test_y = numpy.array([data[1] for data in test_data]).astype("float32")
 
-results = inferencer.infer({'x': test_feat})
+results = inferencer.infer({'x': test_x})
 
 print("infer results: (House Price)")
-for k in range(0, batch_size-1):
-    print("%d. %f" % (k, results[0][k]))
+for idx, val in enumerate(results[0]):
+    print("%d: %.2f" % (idx, val))
 
 print("\nground truth:")
-for k in range(0, batch_size-1):
-    print("%d. %f" % (k, test_label[k]))
+for idx, val in enumerate(test_y):
+    print("%d: %.2f" % (idx, val))
 ```
 
 ## 总结
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png
deleted file mode 100644
index 27e4acb1313794f52ad9ad9e874cdadd197ff41f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions_en.png
deleted file mode 100644
index f111c7cd766b7e9981513cc8c65be87dbbf3a79e..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png
deleted file mode 100644
index 5325df4800985983e17476f007658d1cdb170b1c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges_en.png
deleted file mode 100644
index 6d6a079bfdcc33617f6cf36612b271b48be6304f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test1.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test1.png
deleted file mode 100644
index bcd304a6a0baf30ecfbc43e08fc0aca179d05958..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test1.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
index 3289116991cb8ebaa4a6fb78e100ce16f633d69c..ac36c4ecf6b9b716fe5f0dbe2346e64918c22242 100644
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
@@ -6,8 +6,8 @@
 当我们学习编程的时候，编写的第一个程序一般是实现打印"Hello World"。而机器学习（或深度学习）的入门教程，一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题，比较简单，同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集，包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵，标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。
 
 <p align="center">
-    <img src="image/mnist_example_image.png" width="400"><br/>
-    图1. MNIST图片示例
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/mnist_example_image.png?raw=true" width="400"><br/>
+图1. MNIST图片示例
 </p>
 
 MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3（SD-3）和Special Database 1（SD-1）构建而来。由于SD-3是由美国人口调查局的员工进行标注，SD-1是由美国高中生进行标注，因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集（60000条数据）和测试集（10000条数据），其中训练集来自250位不同的标注员，此外还保证了训练集和测试集的标注员是不完全相同的。
@@ -40,12 +40,12 @@ $$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$
 
 在分类问题中，我们一般采用交叉熵代价损失函数（cross entropy loss），公式如下：
 
-$$  L_{cross-entropy} (label, y) = -\sum_i label_ilog(y_i) $$
+$$  L_{cross-entropy}(label, y) = -\sum_i label_ilog(y_i) $$
 
 图2为softmax回归的网络图，图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
 
 <p align="center">
-<img src="image/softmax_regression.png" width=400><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/softmax_regression.png?raw=true" width=400><br/>
 图2. softmax回归网络结构图<br/>
 </p>
 
@@ -54,16 +54,14 @@ $$  L_{cross-entropy} (label, y) = -\sum_i label_ilog(y_i) $$
 Softmax回归模型采用了最简单的两层神经网络，即只有输入层和输出层，因此其拟合能力有限。为了达到更好的识别效果，我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\]。
 
 1.  经过第一个隐藏层，可以得到 $ H_1 = \phi(W_1X + b_1) $，其中$\phi$代表激活函数，常见的有sigmoid、tanh或ReLU等函数。
-
 2.  经过第二个隐藏层，可以得到 $ H_2 = \phi(W_2H_1 + b_2) $。
-
 3.  最后，再经过输出层，得到的$Y=\text{softmax}(W_3H_2 + b_3)$，即为最后的分类结果向量。
 
 
 图3为多层感知器的网络结构图，图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
 
 <p align="center">
-<img src="image/mlp.png" width=500><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/mlp.png?raw=true" width=500><br/>
 图3. 多层感知器网络结构图<br/>
 </p>
 
@@ -72,7 +70,7 @@ Softmax回归模型采用了最简单的两层神经网络，即只有输入层
 在多层感知器模型中，将图像展开成一维向量输入到网络中，忽略了图像的位置和结构信息，而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构：输入的二维图像，先经过两次卷积层到池化层，再经过全连接层，最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。
 
 <p align="center">
-<img src="image/cnn.png"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/cnn.png?raw=true" width="400"><br/>
 图4. LeNet-5卷积神经网络结构<br/>
 </p>
 
@@ -81,7 +79,7 @@ Softmax回归模型采用了最简单的两层神经网络，即只有输入层
 卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积，即离散二维滤波器（也称作卷积核）与二维图像做卷积操作，简单的讲是二维滤波器滑动到二维图像上所有位置，并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域，不同卷积核可以提取不同的特征，例如边沿、线性、角等特征。在深层卷积神经网络中，通过卷积操作可以提取出图像低级到复杂的特征。
 
 <p align="center">
-<img src="image/conv_layer.png" width='750'><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/conv_layer.png?raw=true" width='750'><br/>
 图5. 卷积层图片<br/>
 </p>
 
@@ -98,16 +96,15 @@ Softmax回归模型采用了最简单的两层神经网络，即只有输入层
 #### 池化层
 
 <p align="center">
-<img src="image/max_pooling.png" width="400px"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/max_pooling.png?raw=true" width="400px"><br/>
 图6. 池化层图片<br/>
 </p>
 
 池化是非线性下采样的一种形式，主要作用是通过减少网络的参数来减小计算量，并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域，对于每个矩形框的数取最大值作为输出层，如图6所示。
 
-更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。
-
-### 常见激活函数介绍
+更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类]( https://github.com/PaddlePaddle/book/tree/develop/03.image_classification )教程。
 
+### 常见激活函数介绍  
 - sigmoid激活函数： $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $
 
 - tanh激活函数： $ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $
@@ -136,20 +133,18 @@ PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mni
 我们建议使用 Fluid API，因为它更容易学起来。
 
 下面是快速的 Fluid API 概述。
-
 1. `inference_program`：指定如何从数据输入中获得预测的函数。
 这是指定网络流的地方。
 
-2. `train_program`：指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。
+1. `train_program`：指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。
 这是指定损失计算的地方。
 
-3. `optimizer_func`: “指定优化器配置的函数。优化器负责减少损失并驱动培训。Paddle 支持多种不同的优化器。
+1. `optimizer_func`: “指定优化器配置的函数。优化器负责减少损失并驱动培训。Paddle 支持多种不同的优化器。
 
-4. `Trainer`：PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。
+1. `Trainer`：PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。
 通过 `event_handler` 回调函数，用户可以监控培训的进展。
 
-5. `Inferencer`：Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
-
+1. `Inferencer`：Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
 然后，它可以推断数据和返回预测。
 
 在这个演示中，我们将深入了解它们。
@@ -240,6 +235,7 @@ def train_program():
     acc = fluid.layers.accuracy(input=predict, label=label)
     return [avg_cost, acc]
 
+
 ```
 
 #### Optimizer Function 配置
@@ -255,9 +251,9 @@ def optimizer_program():
 
 下一步，我们开始训练过程。`paddle.dataset.movielens.train()`和`paddle.dataset.movielens.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数，每次调用的时候返回一个Python yield generator。
 
-下面`shuffle`是一个reader decorator，它接受一个reader A，返回另一个reader B 。reader B 每次读入`buffer_size`条训练数据到一个buffer里，然后随机打乱其顺序，并且逐条输出。
+下面`shuffle`是一个reader decorator，它接受一个reader A，返回另一个reader B。reader B 每次读入`buffer_size`条训练数据到一个buffer里，然后随机打乱其顺序，并且逐条输出。
 
-`batch`是一个特殊的decorator，它的输入是一个reader，输出是一个batched reader 。在PaddlePaddle里，一个reader每次yield一条训练数据，而一个batched reader每次yield一个minibatch。
+`batch`是一个特殊的decorator，它的输入是一个reader，输出是一个batched reader。在PaddlePaddle里，一个reader每次yield一条训练数据，而一个batched reader每次yield一个minibatch。
 
 ```python
 train_reader = paddle.batch(
@@ -280,7 +276,6 @@ place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
 trainer = fluid.Trainer(
     train_func=train_program, place=place, optimizer_func=optimizer_program)
-
 ```
 
 #### Event Handler 配置
@@ -315,11 +310,10 @@ def event_handler(event):
 
 `event_handler_plot` 可以用来在训练过程中画图如下：
 
-
-<p align="center">
-<img src="image/train_and_test2.png" width="400"><br/>
-图7. 训练结果
-</p>
+<div align="center">
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/train_and_test.png?raw=true" width="400"><br/>
+图7 训练结果
+</div>
 
 
 ```python
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png
deleted file mode 100644
index 3f5cdaacdc6acce41c5c6c99649be46685cf9903..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_en.png
deleted file mode 100644
index bc1a9a4ccf81972dc0d69cf4c808a52218e14d61..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png
deleted file mode 100644
index 65bd17eacd41bbdbdb042bd1ba366eb53663b410..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log_en.png
deleted file mode 100644
index 77524754df906ab096e120bd657449f4565c3418..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/conv_layer.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/conv_layer.png
deleted file mode 100644
index c751892ba0be3ae803b5933c3f33487ecfb6fe7f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/conv_layer.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png
deleted file mode 100644
index 030cd60d3b4af9aecd4941204da4ad15f6e1189f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png
deleted file mode 100644
index 90b02fa2a735cfcc9efb2de90906325dedcb358c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling_en.png
deleted file mode 100644
index c626723512b6ee02abd55e5bab65e7629d130522..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png
deleted file mode 100644
index 9f4d26cd8da32201d0a5e9c72d466301dd2b42a1..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_en.png
deleted file mode 100644
index 1fedea6a75abbf132cbbcf8ab10ce045997d697a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png
deleted file mode 100644
index f5a478fdc24f29c17555a2f1451f3f5a079faed9..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log_en.png
deleted file mode 100644
index 7d5508a1eccfcea1925f438043ee93b57769bebf..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png
deleted file mode 100644
index 4edd7cabf8a2282f6392ac1421c7ca4afb288589..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png
deleted file mode 100644
index 40b98298288b9c406fce1cbca9c913753020a94d..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression_en.png
deleted file mode 100644
index 833d3c663c94dd2d57fd19686949ded37a91f541..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png
deleted file mode 100644
index 47204941af7f22e68386a70a06ec4f122b83e262..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log_en.png
deleted file mode 100644
index 6fa0a951d5262effb707e3e15af8cb900e5560b8..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test2.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test2.png
deleted file mode 100644
index 5cb87b450d0398bcfaec0e647c362052069797e7..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test2.png and /dev/null differ
diff --git a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
index a2f30823a6fcd379f94e6e98d043b0d00681827f..99f8bee5ca1519ccf5d7c35ad2a64da4a8841ada 100644
--- a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
+++ b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
@@ -104,6 +104,7 @@ visualDL --logdir=scratch_log --port=8080
 
 # 访问 http://127.0.0.1:8080
 ```
+如果出现`TypeError: __init__() got an unexpected keyword argument 'file'`, 是因为protobuf不是3.5以上，运行`pip install --upgrade protobuf`就能解决。
 
 如果在虚拟环境下仍然遇到安装问题，请尝试以下方法。
 
@@ -149,7 +150,7 @@ python setup.py bdist_wheel
 pip install --upgrade dist/visualdl-*.whl
 ```
 
-如果打包和安装遇到其他问题，不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/how_to_dev_frontend_en.md)
+如果打包和安装遇到其他问题，不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/develop/how_to_dev_frontend_cn.md)
 
 
 ## SDK
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst b/doc/fluid/new_docs/user_guides/howto/inference/build_and_install_lib_cn.rst
similarity index 100%
rename from doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst
rename to doc/fluid/new_docs/user_guides/howto/inference/build_and_install_lib_cn.rst
diff --git a/doc/fluid/new_docs/user_guides/howto/inference/index.rst b/doc/fluid/new_docs/user_guides/howto/inference/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..45e1a2883773b92ed47ef8d51417bbdcd060b4ec
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/inference/index.rst
@@ -0,0 +1,11 @@
+############
+模型预测部署
+############
+
+PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线
+
+.. toctree::
+   :maxdepth: 2
+
+   build_and_install_lib_cn.rst
+   native_infer.rst
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst b/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst
similarity index 97%
rename from doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
rename to doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst
index aa9377c112856693cda72779bd399f2415d716f0..6d6f3035c0b5c985cd39d45df9f1bcce50dcefa0 100644
--- a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
+++ b/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst
@@ -4,12 +4,13 @@ Paddle 预测 API
 为了更简单方便的预测部署，Fluid 提供了一套高层 API
 用来隐藏底层不同的优化实现。
 
-`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`__
+`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`_
 包括
 
 -  头文件 ``paddle_inference_api.h`` 定义了所有的接口
 -  库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
 
+
 编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。
 
 下面是一些 API 概念的介绍
@@ -95,7 +96,7 @@ engine
     CHECK(predictor->Run(slots, &outputs));
     // 获取 outputs ...
 
-编译时，联编 ``libpaddle_fluid.a/.so`` 即可。
+编译时，联编 ``libpaddle_fluid.a/.so`` 便可。
 
 详细代码参考
 ------------
diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
index 56fa928029903f1e3bd3e8064c146797f01b2b85..cca3684b78518867eae95d82e1347b52427ddc81 100644
--- a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
@@ -38,7 +38,6 @@ PaddlePaddle Fluid支持两种传入数据的方式:
    :maxdepth: 2
 
    feeding_data
-   use_recordio_reader
 
 Python Reader
 #############
diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
deleted file mode 100644
index dfda33f1b03516fe2c704f55d095955282b19109..0000000000000000000000000000000000000000
--- a/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
+++ /dev/null
@@ -1,167 +0,0 @@
-.. _user_guide_use_recordio_as_train_data:
-
-############################
-使用RecordIO文件作为训练数据
-############################
-
-相比于 :ref:`user_guide_use_numpy_array_as_train_data`，
-:ref:`user_guide_use_recordio_as_train_data` 的性能更好；
-但是用户需要先将训练数据集转换成RecordIO文件格式，再使用
-:code:`fluid.layers.open_files()` 层在神经网络配置中导入 RecordIO 文件。
-用户还可以使用 :code:`fluid.layers.double_buffer()` 加速数据从内存到显存的拷贝，
-使用 :code:`fluid.layers.Preprocessor` 工具进行数据增强。
-
-将训练数据转换成RecordIO文件格式
-################################
-
-:code:`fluid.recordio_writer` 中，每个记录都是一个
-:code:`vector<LoDTensor>`, 即一个支持序列信息的Tensor数组。这个数组包括训练所需
-的所有特征。例如对于图像分类来说，这个数组可以包含图片和分类标签。
-
-用户可以使用 :code:`fluid.recordio_writer.convert_reader_to_recordio_file()` 可以将
-:ref:`user_guide_reader` 转换成一个RecordIO文件。或者可以使用
-:code:`fluid.recordio_writer.convert_reader_to_recordio_files()` 将一个
-:ref:`user_guide_reader` 转换成多个RecordIO文件。
-
-具体使用方法为:
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   import numpy
-
-   def reader_creator():
-       def __impl__():
-           for i in range(1000):
-               yield [
-                        numpy.random.random(size=[3,224,224], dtype="float32"),
-                        numpy.random.random(size=[1], dtype="int64")
-                     ]
-       return __impl__
-
-   img = fluid.layers.data(name="image", shape=[3, 224, 224])
-   label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-   feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
-
-   BATCH_SIZE = 32
-   reader = paddle.batch(reader_creator(), batch_size=BATCH_SIZE)
-   fluid.recordio_writer.convert_reader_to_recordio_file(
-      "train.recordio", feeder=feeder, reader_creator=reader)
-
-其中 :code:`reader_creator` 创建了一个 :code:`Reader`。
-:ref:`_api_fluid_data_feeder_DataFeeder`
-是将 :code:`Reader` 转换成 :code:`LoDTensor` 的工具。详细请参考
-:ref:`user_guide_reader` 。
-
-上述程序将 :code:`reader_creator` 的数据转换成了 :code:`train.recordio` 文件，
-其中每一个record 含有 32 条样本。如果batch size会在训练过程中调整，
-用户可以将每一个Record的样本数设置成1。并参考
-:ref:`user_guide_use_recordio_as_train_data_use_op_create_batch`。
-
-
-配置神经网络, 打开RecordIO文件
-##############################
-
-RecordIO文件转换好之后，用户可以使用 :code:`fluid.layers.open_files()`
-打开文件，并使用 :code:`fluid.layers.read_file` 读取文件内容。
-简单使用方法如下:
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-
-   file_obj = fluid.layers.open_files(
-     filenames=["train.recordio"],
-     shape=[[3, 224, 224], [1]],
-     lod_levels=[0, 0],
-     dtypes=["float32", "int64"],
-     pass_num=100
-   )
-
-   image, label = fluid.layers.read_file(file_obj)
-
-其中如果设置了 :code:`pass_num` ，那么当所有数据读完后，会重新读取数据，
-直到读取了 :code:`pass_num` 遍。
-
-
-
-进阶使用
-########
-
-
-使用 :code:`fluid.layers.double_buffer()`
-------------------------------------------
-
-:code:`Double buffer` 使用双缓冲技术，将训练数据从内存中复制到显存中。配置双缓冲
-需要使用 :code:`fluid.layers.double_buffer()` 修饰文件对象。 例如:
-
-.. code-block:: python
-
-   import paddle.fliud as fluid
-   file_obj = fluid.layers.open_files(...)
-   file_obj = fluid.layers.double_buffer(file_obj)
-
-   image, label = fluid.layers.read_file(file_obj)
-
-双缓冲技术可以参考
-`Multiple buffering <https://en.wikipedia.org/wiki/Multiple_buffering>`_ 。
-
-配置数据增强
-------------
-
-使用 :code:`fluid.layers.Preprocessor` 可以配置文件的数据增强方法。例如
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   preprocessor = fluid.layers.Preprocessor(reader=data_file)
-   with preprocessor.block():
-       image, label = preprocessor.inputs()
-       image = image / 2
-       label = label + 1
-       preprocessor.outputs(image, label)
-
-如上代码所示，使用 :code:`Preprocessor` 定义了一个数据增强模块，并在
-:code:`with preprocessor.block()` 中定义了数据增强的具体操作。 用户通过配置
-:code:`preprocessor.inputs()` 获得数据文件中的各个字段。 并用
-:code:`preprocessor.outputs()` 标记预处理后的输出。
-
-.. _user_guide_use_recordio_as_train_data_use_op_create_batch:
-
-使用Op组batch
--------------
-
-使用 :code:`fluid.layers.batch()` 可以在训练的过程中动态的组batch。例如
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   file_obj = fluid.layers.batch(file_obj, batch_size=32)
-
-   img, label = fluid.layers.read_file(file_obj)
-
-需要注意的是，如果数据集中的最后几个样本不能组成 :code:`batch_size` 大小的批量数据，
-那么这几个样本直接组成一个批量数据进行训练。
-
-读入数据的shuffle
------------------
-
-使用 :code:`fluid.layers.shuffle()` 可以在训练过程中动态重排训练数据。例如
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   file_obj = fliud.layers.shuffle(file_obj, buffer_size=8192)
-
-   img, label = fliud.layers.read_file(file_obj)
-
-需要注意的是:
-
-1. :code:`shuffle` 实现方法是:
-先读入 :code:`buffer_size` 条样本，再随机的选出样本进行训练。
-
-2. :code:`shuffle` 中 :code:`buffer_size` 会占用训练内存，需要确定训练过程中内存
-足够支持缓存 :code:`buffer_size` 条数据。
diff --git a/doc/fluid/new_docs/user_guides/index.rst b/doc/fluid/new_docs/user_guides/index.rst
index 453cb71cfdf72e031ce0f0517e2db936eca38dfc..377631109d8f65c149b12cd2a0e4da920fdf4def 100644
--- a/doc/fluid/new_docs/user_guides/index.rst
+++ b/doc/fluid/new_docs/user_guides/index.rst
@@ -15,4 +15,5 @@
     howto/training/index
     howto/debug/index
     howto/evaluation/index
+    howto/inference/index
     models/index.rst
diff --git a/doc/survey/dynamic_graph.md b/doc/survey/dynamic_graph.md
index 6b80b014b1b1dc50f425e1296f70984c9e9b1cbd..7f62eeadff43af1f0a3c81e284a6508bf063b21e 100644
--- a/doc/survey/dynamic_graph.md
+++ b/doc/survey/dynamic_graph.md
@@ -2,42 +2,47 @@
 
 ## Automatic Differentiation
 
-A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers.  Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf).
+A key challenge in deep learning is to automatically derive the backward pass given the forward pass as a program, which has been long studied in the field of [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf), or autodiff, before the prosperity of deep learning.
 
-## The Tape
+## Program Transformation v.s. Backtracking
 
-Given the forward pass program (usually in Python in practices), there are two strategies to derive the backward pass:
+Given the forward pass program, there are two strategies to derive the backward pass:
 
-1. from the forward pass program itself, or
-1. from the execution trace of the forward pass program, which is often known as the *tape*.
+1. by transforming the forward pass program without executing it, or
+1. by backtracking the execution process of the forward pass program.
 
-This article surveys systems that follow the latter strategy.
+This article is about the latter strategy. 
 
-## Dynamic Network
+## The Tape and Dynamic Networks
 
-When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration.  This is known as *dynamic network*.
+We refer to the trace of the execution of the forward pass program as a *tape* [[1]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf).  When we train a deep learning model, the tape changes every iteration as the input data change, so we'd have to re-derive the backward pass, which is time-consuming, but also eases the case that the forward program includes control flows like if-else and for/while. With these control flows, the execution trace might change with iterations.  Such changes are known as *dynamic networks* in the field of deep learning.
 
-Deep learning systems that utilize the idea of dynamic network gained their popularities in recent years.  This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/).
+## Typical Systems
 
-## An Overview
+Deep learning systems that utilize the idea of dynamic networks gained their popularities in recent years.  This article surveys the following typical systems: 
 
-Both frameworks record a ‘tape’ of the computation and interpreting (or run-time compiling) a transformation of the tape played back in reverse. This tape is a different kind of entity than the original program.[[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf)
+- [DyNet](https://dynet.readthedocs.io/en/latest/)
+- [PyTorch](https://pytorch.org/)
+- Chainer
+- Autograd from HIPS
 
-Consider the following code feedforward model.
+Before diving into these systems, let us pose an example forward pass program:
 
 ```python
 x = Variable(randn(20, 1)))
 label = Variable(randint(1))
 W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20))
 h = matmul(W_1, x)
-pred = matmul(W_2, x)
+pred = matmul(W_2, h)
 loss = softmax(pred, label)
 loss.backward()
 ```
 
-### 1) Dynet uses List to encode the Tape
+## The Representation of Tapes
 
-During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, are recorded in the tape, along with the necessary information needed to do the backward such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.
+### DyNet: the Tape as a List
+
+DyNet uses a linear data structure, a list, to represent the tape. During the execution of the above example, it is a list of operators: `matmul`, `matmul`, and `softmax`.  The list also includes information needed to do the backward pass, such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward().`
 
 <details> 
 <summary></summary>
@@ -69,9 +74,9 @@ digraph g {
 
 ![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20})
 
-### 2) Pytorch uses Node Graph to encode the Tape
+### PyTorch: the Tape as a Graph
 
-The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order.
+The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order.  Please be aware that a `Function` might have more than one `prev_func`s.
 
 <details> 
 <summary></summary>
@@ -132,27 +137,22 @@ digraph g {
 
 ![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20})
 
-Chainer and Autograd uses the similar techniques to record the forward pass. For details please refer to the appendix.
-
-## Design choices
+Chainer and Autograd use the similar techniques to record the forward pass. For details, please refer to the appendix.
 
-### 1) Dynet's List vs Pytorch's Node Graph
+## Comparison: List v.s. Graph
 
-What's good about List:
-1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and calling the corresponding backward operator.
-1. It promises effient data parallelism implementations. One could count the time of usage of a certain variable during the construction list. Then in the play back, one knows the calculation of a variable has completed. This enables communication and computation overlapping.
+The list of DyNet could be considered the result of the topological sort of the graph of PyTorch. Or, the graph is the raw representation of the tape, which gives us the chance to *prune* part of the graph that is irrelevant with the backward pass before the topological sort [[2]](https://openreview.net/pdf?id=BJJsrmfCZ). Consider the following example, PyTorch only does backward on `SmallNet` while DyNet does both `SmallNet` and `BigNet`:
 
-What's good about Node Graph:
-1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. [[2]](https://openreview.net/pdf?id=BJJsrmfCZ) Consider the following example, Pytorch only does backward on SmallNet while Dynet does both BigNet and SmallNet.
 ```python
 result = BigNet(data)
 loss = SmallNet(data)
 loss.backward()
 ```
 
-### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation
+## Lazy v.s. Immediate Evaluation
+
+Another difference between DyNet and PyTorch is that DyNet lazily evaluates the forward pass, whereas PyTorch executes it immediately. Consider the following example:
 
-Dynet builds the list in a symbolic matter. Consider the following example
 ```python
 for epoch in range(num_epochs):
     for in_words, out_label in training_data:
@@ -164,16 +164,17 @@ for epoch in range(num_epochs):
         loss_val = loss_sym.value()
         loss_sym.backward()
 ```
+
 The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This defered execution is useful because it allows some graph-like optimization possible, e.g. kernel fusion.
 
-Pytorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
+PyTorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
 
 
-## What can fluid learn from them?
+## Fluid: Learning the Lessons
 
 Please refer to `paddle/contrib/dynamic/`.
 
-# Appendix
+## Appendix
 
 ### Overview
 
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index ed4e67879c795258683b094cfaeaff9063d66848..ae5f30e431aba4cae04b0fb35f00bce84f18de33 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -43,6 +43,7 @@ paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list',
 paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
+paddle.fluid.Trainer.save_inference_model ArgSpec(args=['self', 'param_path', 'feeded_var_names', 'target_var_indexes'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None)
@@ -65,7 +66,7 @@ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'pla
 paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
 paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspilerConfig.__init__ 
-paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
+paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0, None))
 paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
@@ -170,7 +171,9 @@ paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], vara
 paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
 paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
+paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
 paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
+paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
@@ -310,7 +313,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
 paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
-paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
+paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
@@ -374,7 +377,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
 paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0))
+paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
diff --git a/paddle/fluid/framework/.gitignore b/paddle/fluid/framework/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..5132131e55e2feee8ae88b4c65ec102fbc9c5fe1
--- /dev/null
+++ b/paddle/fluid/framework/.gitignore
@@ -0,0 +1,2 @@
+.tensor_util.cu
+.data_type_transform.cu
\ No newline at end of file
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 0668ff43c8192f53ff7e05abaeb575e2b78b1de4..cc7938b2ac07f11ceb7f33a2e37380d1e2ed2072 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1,3 +1,22 @@
+# windows treat symbolic file as a real file, which is different with unix
+# We create a hidden file and compile it instead of origin source file.
+function(windows_symbolic TARGET)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  foreach(src ${windows_symbolic_SRCS})
+  get_filename_component(src ${src} NAME_WE)
+  if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
+      message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
+  endif()
+  add_custom_command(OUTPUT .${src}.cu 
+          COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
+          COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
+          COMMENT "create hidden file of ${src}.cu")
+  add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)  
+  endforeach()
+endfunction()
+
 add_subdirectory(ir)
 if (NOT WIN32)
 add_subdirectory(details)
@@ -11,7 +30,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
-  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
+  if (WIN32)
+    windows_symbolic(tensor_util SRCS tensor_util.cu)
+    nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
+    add_dependencies(tensor tensor_util)
+  else()
+    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
+  endif(WIN32)
 else()
   cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
 endif()
@@ -55,7 +80,13 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
         DEPS operator op_registry device_context math_function)
 
 if(WITH_GPU)
-  nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+  if (WIN32)
+      windows_symbolic(hidden_file SRCS data_type_transform.cu)
+      nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
+      add_dependencies(data_type_transform hidden_file)
+  else()
+      nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+  endif(WIN32)
   nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
 else()
   cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index cd00b7de7338982308acfa1f1e8c38e010c6a43b..c9e3a8ac1d1e5228725bff49ecc6d91e640dfe57 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -46,7 +46,7 @@ struct CastDataLayout {
   const std::vector<int> axis_;
 
   template <typename T>
-  void operator()() {
+  void apply() {
     auto place = ctx_->GetPlace();
 
     if (platform::is_cpu_place(place)) {
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index 84691a2059124960a3213802fec0863f8abe6df7..8ad2fb5f3ffd9641932bbbb024a31e81d31dc9bb 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -26,75 +26,40 @@ namespace framework {
 extern proto::VarType::Type ToDataType(std::type_index type);
 extern std::type_index ToTypeIndex(proto::VarType::Type type);
 
-#if !defined(_WIN32)
 template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
   switch (type) {
     case proto::VarType::FP16:
-      visitor.template operator()<platform::float16>();
+      visitor.template apply<platform::float16>();
       break;
     case proto::VarType::FP32:
-      visitor.template operator()<float>();
+      visitor.template apply<float>();
       break;
     case proto::VarType::FP64:
-      visitor.template operator()<double>();
+      visitor.template apply<double>();
       break;
     case proto::VarType::INT32:
-      visitor.template operator()<int>();
+      visitor.template apply<int>();
       break;
     case proto::VarType::INT64:
-      visitor.template operator()<int64_t>();
+      visitor.template apply<int64_t>();
       break;
     case proto::VarType::BOOL:
-      visitor.template operator()<bool>();
+      visitor.template apply<bool>();
       break;
     case proto::VarType::UINT8:
-      visitor.template operator()<uint8_t>();
+      visitor.template apply<uint8_t>();
       break;
     case proto::VarType::INT16:
-      visitor.template operator()<int16_t>();
+      visitor.template apply<int16_t>();
       break;
     case proto::VarType::INT8:
-      visitor.template operator()<int8_t>();
+      visitor.template apply<int8_t>();
       break;
     default:
       PADDLE_THROW("Not supported %d", type);
   }
 }
-#else
-// the msvc compiler do not implement two-stage name lookup correctly.
-template <typename Visitor>
-inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
-  switch (type) {
-    case proto::VarType::FP16:
-      visitor.operator()<platform::float16>();
-      break;
-    case proto::VarType::FP32:
-      visitor.operator()<float>();
-      break;
-    case proto::VarType::FP64:
-      visitor.operator()<double>();
-      break;
-    case proto::VarType::INT32:
-      visitor.operator()<int>();
-      break;
-    case proto::VarType::INT64:
-      visitor.operator()<int64_t>();
-      break;
-    case proto::VarType::BOOL:
-      visitor.operator()<bool>();
-      break;
-    case proto::VarType::UINT8:
-      visitor.operator()<uint8_t>();
-      break;
-    case proto::VarType::INT16:
-      visitor.operator()<int16_t>();
-      break;
-    default:
-      PADDLE_THROW("Not supported %d", type);
-  }
-}
-#endif  // _WIN32
 
 extern std::string DataTypeToString(const proto::VarType::Type type);
 extern size_t SizeOfType(std::type_index type);
diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
index 5a57ec20585c26dbcd4251464718fc819148a7a5..d79f8cacb5f4727defc77380371e57bcea65f068 100644
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -37,7 +37,7 @@ struct CastDataType {
   const platform::DeviceContext* ctx_;
 
   template <typename OutType>
-  void operator()() {
+  void apply() {
     auto* in_begin = in_.data<InType>();
     auto* in_end = in_begin + in_.numel();
     auto* out_begin = out_->mutable_data<OutType>(in_.place());
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 0bfff745493d069e948e6d277ec2bbfb0673a70b..7a99169849debcbc57d6f197b36c5045b211f3ef 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -326,7 +326,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   ir::Graph &result = *graph;
 
   for (auto &node : nodes) {
-    if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) {
+    if (node->IsVar() && node->Var()) {
       all_vars_.emplace(node->Name(), node->Var());
     }
   }
@@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
   }
 }
 
-bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
-    const std::string &og,
-    std::unordered_set<std::string> *og_has_been_broadcast) const {
-  bool is_pg_once =
-      grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
-  if (is_pg_once) {
-    // Insert NCCL AllReduce Op
-    og_has_been_broadcast->insert(og);
-  }
-  return is_pg_once;
-}
-
 int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
                                            ir::Node *node) const {
   if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
@@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
   return var;
 }
 
-// Find the first occurence of `prev_op_name` and make current `op` depend
-// on it.
-void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op,
-                                        const std::string &prev_op_name) const {
-  for (auto &prev_op : result->Get<GraphOps>(kGraphOps)) {
-    if (prev_op->Name() == prev_op_name) {
-      auto *dep_var = new DummyVarHandle(result->CreateControlDepVar());
-      prev_op->AddOutput(dep_var);
-      result->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
-      op->AddInput(dep_var);
-    }
-  }
-}
-
 void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
                                                 ir::Node *node) const {
   int op_dev_id = -1;
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 7a6f238f9cf7af18cb10ea271e453fec1902c833..ac6d9c5a64cfde60f75c76dae0a30cc7d735e996 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
   std::vector<std::string> FindDistTrainRecvVars(
       const std::vector<ir::Node *> &nodes) const;
 
-  void ConnectOp(ir::Graph *result, OpHandleBase *op,
-                 const std::string &prev_op_name) const;
-
   void CreateComputationalOps(ir::Graph *result, ir::Node *node,
                               size_t num_places) const;
 
@@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
   void CreateComputationalOp(ir::Graph *result, ir::Node *node,
                              int dev_id) const;
 
-  bool IsParameterGradientOnce(
-      const std::string &og,
-      std::unordered_set<std::string> *og_has_been_broadcast) const;
-
   int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;
 
   void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
index e28264eb32756f77ef5baed3dff77ba9f0943160..bd6153c0c736f6e32378eebcbf6c4d7e402c9b42 100644
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -31,7 +31,7 @@ struct ReduceLoDTensor {
       : src_tensors_(src), dst_tensor_(*dst) {}
 
   template <typename T>
-  void operator()() const {
+  void apply() const {
     PADDLE_ENFORCE(!src_tensors_.empty());
     auto &t0 = *src_tensors_[0];
     PADDLE_ENFORCE_NE(t0.numel(), 0);
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index bfc649017f19d67660bd11d590134cf56772bb27..78387c407398b58d3fab6eab12445c4198f809b5 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -1,20 +1,44 @@
+set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
+file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt.  DO NOT EDIT!\n\n")
+file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
+
+
+# Usage: pass_library(target inference) will append to paddle_inference_pass.h
+function(pass_library TARGET DEST)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass ${op_library_DEPS})
+    # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
+    if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
+        message(STATUS "add pass ${TARGET} ${DEST}")
+        file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
+        set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
+    endif()
+endfunction()
+
 cc_library(node SRCS node.cc DEPS proto_desc)
 cc_library(graph SRCS graph.cc DEPS node)
 cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
-cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
-cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
-cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector)
-cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector)
-cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass)
-cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector)
-cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector)
+
+pass_library(graph_to_program_pass base)
+pass_library(graph_viz_pass base)
+pass_library(fc_fuse_pass inference)
+pass_library(attention_lstm_fuse_pass inference)
+pass_library(infer_clean_graph_pass inference)
+pass_library(fc_lstm_fuse_pass inference)
+pass_library(fc_gru_fuse_pass inference)
+pass_library(seq_concat_fc_fuse_pass inference)
+
+set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
 
 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
-cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto)
+cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index d2d051a69a33a38535e67227d4cc62f5b35e430c..bb52d7e498e55c02ddc2cd6d07ccccd51ce4edc5 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
+#include <string>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/api/helper.h"
 
 namespace paddle {
 namespace framework {
@@ -96,17 +96,13 @@ void FindWhileOp(Graph* graph) {
   auto* cell_init = graph->RetriveNode(6);
   auto* hidden_init = graph->RetriveNode(8);
 
-#define LINK_TO(node0, node1)      \
-  node0->outputs.push_back(node1); \
-  node1->inputs.push_back(node0);
-
   auto* lstm_op = graph->CreateOpNode(&op_desc);
   PrepareParameters(graph, param);
 
-  LINK_TO(X, lstm_op);
-  LINK_TO(cell_init, lstm_op);
-  LINK_TO(hidden_init, lstm_op);
-  LINK_TO(lstm_op, LSTMOUT);
+  IR_NODE_LINK_TO(X, lstm_op);
+  IR_NODE_LINK_TO(cell_init, lstm_op);
+  IR_NODE_LINK_TO(hidden_init, lstm_op);
+  IR_NODE_LINK_TO(lstm_op, LSTMOUT);
 
   GraphSafeRemoveNodes(graph, marked_nodes);
 }
@@ -216,11 +212,11 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
 
   float* out_data = out->mutable_data<float>(platform::CPUPlace());
   std::array<const float*, 4> tensors(
-      {W_forget_w0.data<float>(), W_input_w0.data<float>(),
-       W_output_w0.data<float>(), W_cell_w0.data<float>()});
+      {{W_forget_w0.data<float>(), W_input_w0.data<float>(),
+        W_output_w0.data<float>(), W_cell_w0.data<float>()}});
   std::array<const float*, 4> tensors1(
-      {W_forget_w1.data<float>(), W_input_w1.data<float>(),
-       W_output_w1.data<float>(), W_cell_w1.data<float>()});
+      {{W_forget_w1.data<float>(), W_input_w1.data<float>(),
+        W_output_w1.data<float>(), W_cell_w1.data<float>()}});
 
   for (int row = 0; row < D; row++) {
     for (int col = 0; col < 4; col++) {
@@ -243,8 +239,8 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
                      const LoDTensor& B_output, const LoDTensor& B_cell,
                      LoDTensor* out) {
   std::array<const float*, 4> tensors(
-      {B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
-       B_cell.data<float>()});
+      {{B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
+        B_cell.data<float>()}});
 
   PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
   int D = B_forget.dims()[0];
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index 513742bab69d465aac1bfb7bcef2fe89108c14a0..ca704c7f5631bbaa88f1bc2caaa22fd021de11c4 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -21,120 +21,51 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-bool VarOutLinksToOp(Node* node, const std::string& op_type) {
-  for (auto* out : node->outputs) {
-    if (out->IsOp() && out->Op()->Type() == op_type) {
-      return true;
-    }
-  }
-  return false;
-}
-
-void BuildFCPattern(PDPattern* pattern) {
-  // Create Operators
-  auto* mul_op = pattern->NewNode("mul")->assert_is_op("mul");
-  auto* elementwise_add_op =
-      pattern->NewNode("elementwise_add")->assert_is_op("elementwise_add");
-  // Create variables
-  // w
-  auto* mul_weight_var = pattern->NewNode("mul_weight")
-                             ->AsInput()
-                             ->assert_is_op_nth_input("mul", "Y", 0);
-  // x
-  auto* mul_tmp_var = pattern->NewNode("mul_tmp_var")
-                          ->AsInput()
-                          ->assert_is_op_nth_input("mul", "X", 0);
-  // intermediate variable, will be removed in the IR after fuse.
-  auto* mul_out_var = pattern->NewNode("mul_out")
-                          ->AsIntermediate()
-                          ->assert_is_only_output_of_op("mul")
-                          ->assert_is_op_input("elementwise_add");
-  // bias
-  auto* elementwise_add_tmp_var = pattern->NewNode("elementwise_add_tmpvar")
-                                      ->assert_is_op_input("elementwise_add")
-                                      ->AsInput();
-  // output
-  auto* elementwise_add_out_var = pattern->NewNode("elementwise_add_out")
-                                      ->AsOutput()
-                                      ->assert_is_op_output("elementwise_add");
-
-  mul_op->LinksFrom({mul_weight_var, mul_tmp_var}).LinksTo({mul_out_var});
-  elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var})
-      .LinksTo({elementwise_add_out_var});
-}
-
-// Replace the node `from` in the links to `to`
-bool LinksReplace(std::vector<Node*>* links, Node* from, Node* to) {
-  for (auto*& n : *links) {
-    if (n == from) {
-      n = to;
-      return true;
-    }
-  }
-  return false;
-}
-
 std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init("fc", graph.get());
+  FusePassBase::Init("fc_fuse", graph.get());
 
   std::unordered_set<Node*> nodes2delete;
 
   GraphPatternDetector gpd;
-  BuildFCPattern(gpd.mutable_pattern());
-
-#define GET_NODE(id)                                              \
-  PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode(#id)), \
-                 "pattern has no Node called %s", #id);           \
-  auto* id = subgraph.at(gpd.pattern().RetrieveNode(#id));        \
-  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
+  auto* x = gpd.mutable_pattern()
+                ->NewNode("fc_fuse/x")
+                ->AsInput()
+                ->assert_is_op_input("mul", "X");
+  patterns::FC fc_pattern(gpd.mutable_pattern(), "fc_fuse");
+  fc_pattern(x, true /*with bias*/);
 
   int found_fc_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
     VLOG(4) << "handle FC fuse";
-    // Currently, there is no FC op available, so I will just simulate the
-    // scenerio.
-    // FC's fusion is simple, just op fuse, no need to process the
-    // parameters.
-    GET_NODE(mul_tmp_var);             // x
-    GET_NODE(mul_weight);              // Y
-    GET_NODE(elementwise_add_tmpvar);  // bias
-    GET_NODE(elementwise_add_out);     // Out
-    GET_NODE(mul);                     // MUL op
-    GET_NODE(elementwise_add);         // ELEMENT_ADD op
-    GET_NODE(mul_out);                 // tmp
-#undef GET_NODE
+    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
 
     // Create an FC Node.
     OpDesc desc;
-    std::string fc_x_in = mul_tmp_var->Name();
-    std::string fc_Y_in = mul_weight->Name();
-    std::string fc_bias_in = elementwise_add_tmpvar->Name();
-    std::string fc_out = elementwise_add_out->Name();
+    std::string fc_x_in = subgraph.at(x)->Name();
+    std::string fc_Y_in = w->Name();
+    std::string fc_bias_in = fc_bias->Name();
+    std::string fc_out_out = fc_out->Name();
     desc.SetInput("Input", std::vector<std::string>({fc_x_in}));
     desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
     desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
-    desc.SetOutput("Out", std::vector<std::string>({fc_out}));
+    desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
     desc.SetType("fc");
     auto fc_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
-    fc_node->inputs =
-        std::vector<Node*>({mul_tmp_var, mul_weight, elementwise_add_tmpvar});
-    fc_node->outputs.push_back(elementwise_add_out);
-
-    // Update link relatons
-    PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node));
-    PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node));
-    PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs,
-                                elementwise_add, fc_node));
-    PADDLE_ENFORCE(
-        LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node));
+    GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});
 
-    // Drop old nodes
-    graph->RemoveNode(mul);
-    graph->RemoveNode(elementwise_add);
-    graph->RemoveNode(mul_out);  // tmp variable
+    PADDLE_ENFORCE(subgraph.count(x));
+    IR_NODE_LINK_TO(subgraph.at(x), fc_node);
+    IR_NODE_LINK_TO(w, fc_node);
+    IR_NODE_LINK_TO(fc_bias, fc_node);
+    IR_NODE_LINK_TO(fc_node, fc_out);
 
     found_fc_count++;
   };
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a902b0b50cf27ff84877053aca2ff921cd00b833
--- /dev/null
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
@@ -0,0 +1,185 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h"
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+static int BuildFusion(Graph* graph, const std::string& name_scope,
+                       Scope* scope, bool with_fc_bias) {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+
+  // Create pattern.
+  patterns::FC fc_pattern(pattern, name_scope);
+  patterns::GRU gru_pattern(pattern, name_scope);
+
+  PDNode* x =
+      pattern->NewNode(patterns::UniqueKey("x"))->assert_var_not_persistable();
+
+  auto* fc_out = fc_pattern(x, with_fc_bias);
+  fc_out->AsIntermediate();  // fc_out is a tmp var, will be removed after fuse.
+  gru_pattern(fc_out);
+
+  // Create New OpDesc
+  auto gru_creater = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h,
+                         Node* bias, Node* hidden, Node* fc_bias) {
+
+    OpDesc op_desc;
+    op_desc.SetType("fusion_gru");
+
+#define NEW_NAME(x) name_scope + "/at." #x ".new"
+#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
+    SET_IN(X, x);
+    SET_IN(WeightX, weight_x);
+    SET_IN(WeightH, weight_h);
+    if (with_fc_bias) {
+      op_desc.SetInput("Bias", {NEW_NAME(bias) + bias->Name()});
+    } else {
+      SET_IN(Bias, bias);
+    }
+#undef SET_IN
+    op_desc.SetInput("H0", {});
+    op_desc.SetOutput("Hidden", {hidden->Name()});
+    op_desc.SetAttr("is_reverse", gru->Op()->GetAttr("is_reverse"));
+    // TODO(TJ): This should be a option for infer
+    op_desc.SetAttr("use_seq", true);
+
+#define SET_IMTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)})
+    SET_IMTERMEDIATE_OUT(ReorderedH0);
+    SET_IMTERMEDIATE_OUT(XX);
+    SET_IMTERMEDIATE_OUT(BatchedInput);
+    SET_IMTERMEDIATE_OUT(BatchedOut);
+#undef SET_IMTERMEDIATE_OUT
+
+    auto* op = graph->CreateOpNode(&op_desc);
+    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+    PADDLE_ENFORCE(scope);
+    if (with_fc_bias) {
+      // Fusion GRU bias = fcbias + grubias
+      auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias->Name());
+      auto* out_bias_tensor =
+          fusion_bias_var->GetMutable<framework::LoDTensor>();
+      PADDLE_ENFORCE(fusion_bias_var);
+      auto* gru_bias_var = scope->FindVar(bias->Name());
+      auto* fc_bias_var = scope->FindVar(fc_bias->Name());
+      PADDLE_ENFORCE(gru_bias_var);
+      PADDLE_ENFORCE(fc_bias_var);
+      const auto& gru_bias_tenosr = gru_bias_var->Get<framework::LoDTensor>();
+      const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
+      // new bias = fc bias + gru bias
+      out_bias_tensor->Resize(gru_bias_tenosr.dims());
+      auto* data = out_bias_tensor->mutable_data<float>(platform::CPUPlace());
+      for (int i = 0; i < out_bias_tensor->numel(); i++) {
+        data[i] =
+            fc_bias_tensor.data<float>()[i] + gru_bias_tenosr.data<float>()[i];
+      }
+    }
+#undef GET_NODE
+
+#define NEW_IMTERMEDIATE_OUT(key) \
+  scope->Var(NEW_NAME(key))->GetMutable<framework::LoDTensor>()
+    NEW_IMTERMEDIATE_OUT(ReorderedH0);
+    NEW_IMTERMEDIATE_OUT(XX);
+    NEW_IMTERMEDIATE_OUT(BatchedInput);
+    NEW_IMTERMEDIATE_OUT(BatchedOut);
+#undef NEW_NAME
+#undef NEW_IMTERMEDIATE_OUT
+
+    IR_NODE_LINK_TO(x, op);
+    IR_NODE_LINK_TO(weight_x, op);
+    IR_NODE_LINK_TO(weight_h, op);
+    IR_NODE_LINK_TO(bias, op);  // actually should link to new bias if have
+    IR_NODE_LINK_TO(op, hidden);
+    // h0?
+    return op;
+  };
+
+  int fusion_count{0};
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    auto* x_n = subgraph.at(x);
+    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, gru_pattern);
+    // nodes need be removed
+    GET_IR_NODE_FROM_SUBGRAPH(BatchGate, BatchGate, gru_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(BatchResetHiddenPrev, BatchGate, gru_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchGate, gru_pattern);
+
+    if (with_fc_bias) {
+      GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
+
+      gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes(
+          {mul, gru, elementwise_add, fc_bias, fc_out, mul_out, BatchGate,
+           BatchResetHiddenPrev, BatchHidden});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    } else {
+      gru_creater(gru, x_n, w, Weight, Bias, Hidden, nullptr);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes(
+          {mul, gru, BatchGate, BatchResetHiddenPrev, BatchHidden});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    }
+#undef GET_NODE
+
+    ++fusion_count;
+  };
+
+  gpd(graph, handler);
+
+  return fusion_count;
+}
+
+std::unique_ptr<ir::Graph> MulGRUFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+                                 false /*with_fc_bias*/);
+
+  AddStatis(fusion_count);
+  return graph;
+}
+
+std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+                                 true /*with_fc_bias*/);
+
+  AddStatis(fusion_count);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
+REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..63e1c72bfb2e2641ae5d44858b342d5e427e9045
--- /dev/null
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op.
+
+class FCGRUFusePass : public FusePassBase {
+ public:
+  virtual ~FCGRUFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+  const std::string name_scope_{"fc_gru_fuse"};
+};
+
+// Just FC without bias
+class MulGRUFusePass : public FusePassBase {
+ public:
+  virtual ~MulGRUFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"fc_nobias_gru_fuse"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index 5852705b6b8d1c650faeae3dc810aac65353b459..f7fda873574a0f8b10251d4fa6b604a9312ad7f9 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -13,109 +13,167 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
-std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
+int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
+                bool with_fc_bias) {
   GraphPatternDetector gpd;
   auto* pattern = gpd.mutable_pattern();
 
-  std::unordered_set<int> fused_ops({// first lstm
-                                     13, 15, 16,
-                                     // second lstm
-                                     23, 25, 26});
-
-  pattern->NewNode([&](Node* x) { return fused_ops.count(x->id()); },
-                   "any_node");
-
-  std::unordered_set<Node*> marked_nodes;
+  // Build pattern
+  PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
+                  ->assert_is_op_input("mul")
+                  ->assert_var_not_persistable();
+  patterns::FC fc_pattern(pattern, name_scope);
 
-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-
-    auto* id = subgraph.at(gpd.pattern().RetrieveNode("any_node"));
-    marked_nodes.insert(id);
-  };
-  gpd(graph.get(), handler);
+  // fc_out is a tmp var, will be removed after fuse, so marked as intermediate.
+  auto* fc_out = fc_pattern(x, with_fc_bias)->AsIntermediate();
+  patterns::LSTM lstm_pattern(pattern, name_scope);
+  lstm_pattern(fc_out);
 
   // Create New OpDesc
-  auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h,
-                          int bias, int hidden, int cell, int xx) {
-#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
-    GET_NODE(input);
-    GET_NODE(weight_x);
-    GET_NODE(weight_h);
-    GET_NODE(bias);
-    GET_NODE(hidden);
-    GET_NODE(cell);
-    GET_NODE(xx);
-    GET_NODE(lstm);
-
+  auto lstm_creator = [&](Node* lstm, Node* input, Node* weight_x,
+                          Node* weight_h, Node* bias, Node* hidden, Node* cell,
+                          Node* xx, Node* fc_bias) {
     OpDesc op_desc;
     op_desc.SetType("fusion_lstm");
-#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()});
+#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
     SET_IN(X, input);
     SET_IN(WeightX, weight_x);
     SET_IN(WeightH, weight_h);
     SET_IN(Bias, bias);
-#undef GET_NODE
 #undef SET_IN
+    if (with_fc_bias) {
+      // Add FC-bias with LSTM-bias and create a new weight
+      PADDLE_ENFORCE(scope);
+      const std::string& new_bias_var = name_scope + "_bias.new";
+      auto* bias_var = scope->Var(new_bias_var);
+      PADDLE_ENFORCE(bias_var);
+      auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
+      auto* lstm_bias_var = scope->FindVar(bias->Name());
+      PADDLE_ENFORCE(lstm_bias_var);
+      const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
+      bias_tensor->Resize(lstm_bias_tensor.dims());
+
+      auto* fc_bias_var = scope->FindVar(fc_bias->Name());
+      const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
+
+      auto* data = bias_tensor->mutable_data<float>(platform::CPUPlace());
+
+      for (int i = 0; i < bias_tensor->numel(); i++) {
+        data[i] =
+            fc_bias_tensor.data<float>()[i] + lstm_bias_tensor.data<float>()[i];
+      }
+      op_desc.SetInput("Bias", {new_bias_var});
+    }
 
-    VLOG(4) << "hidden_n: " << hidden_n->Name();
-    VLOG(4) << "cell: " << cell_n->Name();
-    VLOG(4) << "xx: " << xx_n->Name();
+    // Create temp variables.
+    const std::string BatchedInput = patterns::UniqueKey("BatchedInput");
+    const std::string BatchedCellPreAct =
+        patterns::UniqueKey("BatchedCellPreAct");
+    const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
+
+    scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
+    scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
+    scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
 
     op_desc.SetInput("H0", {});
     op_desc.SetInput("C0", {});
-    op_desc.SetOutput("Hidden", {hidden_n->Name()});
-    op_desc.SetOutput("Cell", {cell_n->Name()});
-    op_desc.SetOutput("XX", {xx_n->Name()});
-    op_desc.SetOutput("BatchedGate", {"blstm_0.tmp_2"});
-    op_desc.SetOutput("BatchCellPreAct", {"blstm_1.tmp_2"});
-    op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
-    op_desc.SetAttr("use_peepholes", false);
-    auto* op = graph->CreateOpNode(&op_desc);
+    op_desc.SetOutput("Hidden", {hidden->Name()});
+    op_desc.SetOutput("Cell", {cell->Name()});
+    op_desc.SetOutput("XX", {xx->Name()});
+    op_desc.SetOutput("BatchedGate", {BatchedGate});
+    op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
+    op_desc.SetOutput("BatchedInput", {BatchedInput});
+    op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
+    op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
+    // TODO(TJ): get from attr
+    op_desc.SetAttr("use_seq", true);
+
+    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+#define OP_SET_OUT(x)                            \
+  const std::string x = patterns::UniqueKey(#x); \
+  op_desc.SetOutput(#x, {x});                    \
+  scope->Var(x)->GetMutable<LoDTensor>()
+    OP_SET_OUT(BatchedCell);
+    OP_SET_OUT(BatchedHidden);
+    OP_SET_OUT(ReorderedH0);
+    OP_SET_OUT(ReorderedC0);
+#undef OP_SET_OUT
 
-#define LINK_TO(a, b)      \
-  a->outputs.push_back(b); \
-  b->inputs.push_back(a);
-    LINK_TO(input_n, op);
-    LINK_TO(weight_x_n, op);
-    LINK_TO(weight_h_n, op);
-    LINK_TO(bias_n, op);
-    LINK_TO(op, hidden_n);
-#undef LINK_TO
+    auto* op = graph->CreateOpNode(&op_desc);
+    IR_NODE_LINK_TO(input, op);
+    IR_NODE_LINK_TO(weight_x, op);
+    IR_NODE_LINK_TO(weight_h, op);
+    IR_NODE_LINK_TO(bias, op);
+    IR_NODE_LINK_TO(op, hidden);
     return op;
+  };
 
+  int fusion_count{0};
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+
+    GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
+    if (with_fc_bias) {
+      GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
+      lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out,
+                   fc_bias);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes(
+          {mul, lstm, elementwise_add});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    } else {
+      GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern);
+      lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out,
+                   nullptr);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes({mul, lstm});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    }
+
+    ++fusion_count;
   };
 
-  lstm_creator(16, 12, 14, 18, 17, 22, 21, 19);
-  lstm_creator(26, 12, 24, 28, 27, 32, 31, 29);
+  gpd(graph, handler);
 
-  // remove all the nodes
+  return fusion_count;
+}
 
-  for (auto* node : marked_nodes) {
-    graph->RemoveNode(const_cast<Node*>(node));
-  }
+std::unique_ptr<ir::Graph> MulLstmFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
 
-  for (auto* node : graph->Nodes()) {
-    for (auto it = node->inputs.begin(); it != node->inputs.end();) {
-      if (marked_nodes.count(*it)) {
-        it = const_cast<Node*>(node)->inputs.erase(it);
-      } else
-        it++;
-    }
-    for (auto it = node->outputs.begin(); it != node->outputs.end();) {
-      if (marked_nodes.count(*it)) {
-        it = const_cast<Node*>(node)->outputs.erase(it);
-      } else
-        it++;
-    }
-  }
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+                                 false /*with_fc_bias*/);
+
+  AddStatis(fusion_count);
+  return graph;
+}
+
+std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+                                 true /*with_fc_bias*/);
 
+  AddStatis(fusion_count);
   return graph;
 }
 
@@ -123,4 +181,5 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
 }  // namespace framework
 }  // namespace paddle
 
+REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
 REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
index 74b08ae558b12c9328db58687cd01edbc37291a8..3ee32c63a46fcc34bdccd1e14d4bbaf9668c49e9 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
@@ -12,20 +12,36 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
-class FCLstmFusePass : public Pass {
+// The MulLstmFusePass and MulLstmFusePass will fuse to the same FusionLstm op.
+
+// Just FC without bias
+class FCLstmFusePass : public FusePassBase {
  public:
   virtual ~FCLstmFusePass() {}
 
  protected:
   std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+  const std::string name_scope_{"fc_lstm_fuse"};
+};
+
+class MulLstmFusePass : public FusePassBase {
+ public:
+  virtual ~MulLstmFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"fc_nobias_lstm_fuse"};
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 55e495a0ed75c3a09703438dcfe01ca8f9d36118..ae8496204d4aeb88c04154d571325d440274e821 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -167,7 +167,6 @@ class Graph {
   std::map<std::string, std::function<void(void)>> attr_dels_;
   std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
   std::unordered_set<ir::Node *> node_set_;
-  int node_count_{0};
 };
 
 bool IsControlDepVar(const ir::Node &var);
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 945ab110b148c320b6626cadaa47d483df68419e..fc7feca567e7a0f623ada77af189ef033b44fc53 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -19,7 +19,9 @@
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/printf.h"
 
 namespace paddle {
 namespace framework {
@@ -71,7 +73,10 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {
 
 void GraphPatternDetector::operator()(Graph* graph,
                                       GraphPatternDetector::handle_t handler) {
-  if (!MarkPDNodesInGraph(*graph)) return;
+  if (!MarkPDNodesInGraph(*graph)) {
+    return;
+  }
+
   auto subgraphs = DetectPatterns();
   UniquePatterns(&subgraphs);
   RemoveOverlappedMatch(&subgraphs);
@@ -81,13 +86,13 @@ void GraphPatternDetector::operator()(Graph* graph,
   LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
   int id = 0;
   for (auto& g : subgraphs) {
-    LOG(INFO) << "optimizing #" << id++ << " subgraph";
+    VLOG(3) << "optimizing #" << id++ << " subgraph";
     handler(g, graph);
   }
 }
 
 bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
-  VLOG(4) << "mark pdnodes in graph";
+  VLOG(3) << "mark pdnodes in graph";
   if (graph.Nodes().empty()) return false;
 
   for (auto& node : GraphTraits::DFS(graph)) {
@@ -102,11 +107,16 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
   for (auto& pdnode : pattern_.nodes()) {
     if (!pdnodes2nodes_.count(pdnode.get())) {
       VLOG(4) << pdnode->name() << " can't find matched Node, early stop";
-
-      return false;
+      // return false;
+    }
+  }
+  for (auto& item : pdnodes2nodes_) {
+    for (auto& n : item.second) {
+      GetMarkedNodes(const_cast<Graph*>(&graph)).insert(n);
     }
   }
   VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
+
   return !pdnodes2nodes_.empty();
 }
 
@@ -272,7 +282,7 @@ void GraphPatternDetector::RemoveOverlappedMatch(
   for (const auto& subgraph : *subgraphs) {
     bool valid = true;
     for (auto& item : subgraph) {
-      if (node_set.count(item.second)) {
+      if (item.first->IsIntermediate() && node_set.count(item.second)) {
         valid = false;
         break;
       }
@@ -328,22 +338,22 @@ PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) {
 }
 
 PDNode* PDNode::assert_is_op() {
-  asserts_.emplace_back([this](Node* x) { return x && x->IsOp(); });
+  asserts_.emplace_back([](Node* x) { return x && x->IsOp(); });
   return this;
 }
 PDNode* PDNode::assert_is_op(const std::string& op_type) {
-  asserts_.emplace_back([this, op_type](Node* x) {
+  asserts_.emplace_back([op_type](Node* x) {
     return x && x->IsOp() && x->Op()->Type() == op_type;
   });
   return this;
 }
 PDNode* PDNode::assert_is_var() {
-  asserts_.emplace_back([this](Node* x) { return x && x->IsVar(); });
+  asserts_.emplace_back([](Node* x) { return x && x->IsVar(); });
   return this;
 }
 PDNode* PDNode::assert_var_not_persistable() {
   assert_is_var();
-  asserts_.emplace_back([this](Node* x) { return !x->Var()->Persistable(); });
+  asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); });
   return this;
 }
 PDNode* PDNode::assert_is_persistable_var() {
@@ -357,7 +367,9 @@ PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type,
   assert_is_op_input(op_type);
   asserts_.emplace_back([=](Node* x) {
     for (auto* op : x->outputs) {
-      if (IsNthInput(x, op, argument, nth)) return true;
+      if (op->IsOp() && op->Op()->Type() == op_type &&
+          IsNthInput(x, op, argument, nth))
+        return true;
     }
     return false;
   });
@@ -368,7 +380,9 @@ PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type,
   assert_is_var();
   asserts_.emplace_back([=](Node* x) {
     for (auto* op : x->inputs) {
-      if (IsNthOutput(x, op, argument, nth)) return true;
+      if (op->IsOp() && op->Op()->Type() == op_type &&
+          IsNthOutput(x, op, argument, nth))
+        return true;
     }
     return false;
   });
@@ -412,6 +426,12 @@ PDNode* PDNode::assert_is_op_output(const std::string& op_type) {
   });
   return this;
 }
+PDNode* PDNode::assert_is_op_output(const std::string& op_type,
+                                    const std::string& argument) {
+  assert_is_var();
+  assert_is_op_nth_output(op_type, argument, 0);
+  return this;
+}
 PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
   assert_is_var();
   asserts_.emplace_back([=](Node* x) {
@@ -424,6 +444,12 @@ PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
   });
   return this;
 }
+PDNode* PDNode::assert_is_op_input(const std::string& op_type,
+                                   const std::string& argument) {
+  assert_is_var();
+  assert_is_op_nth_input(op_type, argument, 0);
+  return this;
+}
 PDNode* PDNode::assert_op_has_n_inputs(const std::string& op_type, size_t n) {
   assert_is_op(op_type);
   asserts_.emplace_back([=](Node* x) { return x->inputs.size() == n; });
@@ -439,6 +465,151 @@ PDNode* PDNode::assert_more(PDNode::teller_t&& teller) {
   return this;
 }
 
+bool VarLinksToOp(Node* node, const std::string& op_type) {
+  for (auto* out : node->outputs) {
+    if (out->IsOp() && out->Op()->Type() == op_type) {
+      return true;
+    }
+  }
+  return false;
+}
+bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth) {
+  PADDLE_ENFORCE(var->IsVar());
+  PADDLE_ENFORCE(op->IsOp());
+  if (op->Op()->Input(argument).size() <= nth) return false;
+  return var->Name() == op->Op()->Input(argument)[nth];
+}
+bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth) {
+  PADDLE_ENFORCE(var->IsVar());
+  PADDLE_ENFORCE(op->IsOp());
+  if (op->Op()->Output(argument).size() <= nth) return false;
+  return var->Name() == op->Op()->Output(argument)[nth];
+}
+void GraphSafeRemoveNodes(Graph* graph,
+                          const std::unordered_set<const Node*>& nodes) {
+  for (auto* node : nodes) {
+    graph->RemoveNode(const_cast<Node*>(node));
+  }
+
+  for (auto* node : graph->Nodes()) {
+    for (auto it = node->inputs.begin(); it != node->inputs.end();) {
+      if (nodes.count(*it)) {
+        it = const_cast<Node*>(node)->inputs.erase(it);
+      } else {
+        it++;
+      }
+    }
+    for (auto it = node->outputs.begin(); it != node->outputs.end();) {
+      if (nodes.count(*it)) {
+        it = const_cast<Node*>(node)->outputs.erase(it);
+      } else {
+        it++;
+      }
+    }
+  }
+}
+bool VarLinksFromOp(Node* node, const std::string& op_type) {
+  for (auto* out : node->inputs) {
+    if (out->IsOp() && out->Op()->Type() == op_type) {
+      return true;
+    }
+  }
+  return false;
+}
+
+PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x,
+                                 bool with_bias) {
+  // Create shared nodes.
+  x->assert_is_op_input("mul", "X");
+  auto* mul = pattern->NewNode(mul_repr())->assert_is_op("mul");
+
+  auto* mul_w_var = pattern->NewNode(w_repr())
+                        ->AsInput()
+                        ->assert_is_persistable_var()
+                        ->assert_is_op_input("mul", "Y");
+
+  auto* mul_out_var =
+      pattern->NewNode(mul_out_repr())->assert_is_op_output("mul");
+
+  if (!with_bias) {  // not with bias
+    // Add links.
+    mul->LinksFrom({x, mul_w_var}).LinksTo({mul_out_var});
+    return mul_out_var;
+
+  } else {  // with bias
+    mul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+    // Create operators.
+    auto* elementwise_add = pattern->NewNode(elementwise_add_repr())
+                                ->assert_is_op("elementwise_add");
+    // Create variables.
+    auto* bias = pattern->NewNode(bias_repr())
+                     ->assert_is_op_input("elementwise_add")
+                     ->AsInput();
+
+    auto* fc_out = pattern->NewNode(Out_repr())
+                       ->AsOutput()
+                       ->assert_is_op_output("elementwise_add");
+
+    mul->LinksFrom({mul_w_var, x}).LinksTo({mul_out_var});
+    elementwise_add->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
+    return fc_out;
+  }
+}
+
+PDNode* patterns::LSTM::operator()(PDNode* x) {
+  x->assert_is_op_input("lstm", "Input");
+  auto* lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm");
+#define NEW_NODE(arg__, io__) \
+  auto* arg__ =               \
+      pattern->NewNode(arg__##_repr())->assert_is_op_##io__("lstm", #arg__);
+
+  // Currently, the H0 and C0 are optional
+  // TODO(Superjomn) upgrade the fuse framework to support optional.
+  // NEW_NODE(H0, input);
+  // NEW_NODE(C0, input);
+  NEW_NODE(Weight, input);
+  NEW_NODE(Bias, input);
+
+  NEW_NODE(Hidden, output);
+  NEW_NODE(Cell, output);
+  NEW_NODE(BatchGate, output);
+  NEW_NODE(BatchCellPreAct, output);
+#undef NEW_NODE
+
+  lstm_op->LinksFrom({x, Weight, Bias});
+  lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
+  return Hidden;
+}
+
+PDNode* patterns::GRU::operator()(PDNode* x) {
+  x->assert_is_op_input("gru", "Input");
+  auto* gru_op = pattern->NewNode(gru_repr())->assert_is_op("gru");
+#define NEW_NODE(arg__, io__) \
+  auto* arg__ =               \
+      pattern->NewNode(arg__##_repr())->assert_is_op_##io__("gru", #arg__);
+
+  NEW_NODE(Weight, input);
+  // TODO(Superjomn): upgrade the fuse framework to support optional.
+  // H0 and bias are optional
+  NEW_NODE(Bias, input);  // also optional
+  // NEW_NODE(H0, input);
+
+  NEW_NODE(Hidden, output);
+  // below are intermediate
+  NEW_NODE(BatchGate, output);
+  NEW_NODE(BatchResetHiddenPrev, output);
+  NEW_NODE(BatchHidden, output);
+#undef NEW_NODE
+
+  BatchGate->AsIntermediate();
+  BatchResetHiddenPrev->AsIntermediate();
+  BatchHidden->AsIntermediate();
+
+  gru_op->LinksFrom({x, Weight, Bias});
+  gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden});
+  return Hidden;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index f8488c84962d1caa6e7817b3c0349d6da3a59182..57482a07b607ba1d9fa06a5f325f60ba58dce307 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -19,6 +19,9 @@
 #endif
 
 #include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/inference/analysis/dot.h"
@@ -95,7 +98,11 @@ struct PDNode {
   PDNode* assert_var_not_persistable();
   PDNode* assert_is_persistable_var();
   PDNode* assert_is_op_output(const std::string& op_type);
+  PDNode* assert_is_op_output(const std::string& op_type,
+                              const std::string& argument);
   PDNode* assert_is_op_input(const std::string& op_type);
+  PDNode* assert_is_op_input(const std::string& op_type,
+                             const std::string& argument);
   PDNode* assert_is_op_nth_input(const std::string& op_type,
                                  const std::string& argument, int nth);
   PDNode* assert_is_op_nth_output(const std::string& op_type,
@@ -167,6 +174,9 @@ class PDPattern {
 
   PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID());
   PDNode* NewNode(const std::string& name = NewID());
+  PDNode* NewNode(const std::string& prefix, const std::string& name) {
+    return NewNode(prefix + "/" + name);
+  }
   PDNode* RetrieveNode(const std::string& id) const;
 
   const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
@@ -238,6 +248,8 @@ class GraphPatternDetector {
   void UniquePatterns(std::vector<subgraph_t>* subgraphs);
 
   // Remove overlapped match subgraphs, when overlapped, keep the previous one.
+  // The intermediate PDNodes will be removed, so can't shared by multiple
+  // patterns.
   void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
 
   // Validate whether the intermediate nodes are linked by external nodes.
@@ -257,64 +269,168 @@ class GraphPatternDetector {
 
 // some helper methods.
 
-// Op's input.
-static bool VarLinksToOp(Node* node, const std::string& op_type) {
-  for (auto* out : node->outputs) {
-    if (out->IsOp() && out->Op()->Type() == op_type) {
-      return true;
-    }
-  }
-  return false;
-}
+// Tell if a var links to an Op
+bool VarLinksToOp(Node* node, const std::string& op_type);
 
-// Op's output.
-static bool VarLinksFromOp(Node* node, const std::string& op_type) {
-  for (auto* out : node->inputs) {
-    if (out->IsOp() && out->Op()->Type() == op_type) {
-      return true;
-    }
-  }
-  return false;
-}
+// Tell if an op links to a var
+bool VarLinksFromOp(Node* node, const std::string& op_type);
 
 // Check whether a var node is a op node's nth input.
-static bool IsNthInput(Node* var, Node* op, const std::string& argument,
-                       size_t nth) {
-  PADDLE_ENFORCE(var->IsVar());
-  PADDLE_ENFORCE(op->IsOp());
-  if (op->inputs.size() <= nth) return false;
-  return var->Name() == op->Op()->Input(argument)[nth];
-}
+bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth);
 
-static bool IsNthOutput(Node* var, Node* op, const std::string& argument,
-                        size_t nth) {
-  PADDLE_ENFORCE(var->IsVar());
-  PADDLE_ENFORCE(op->IsOp());
-  if (op->inputs.size() <= nth) return false;
-  return var->Name() == op->Op()->Output(argument)[nth];
-}
+// Tell whether a var node is a op node's nth output.
+bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);
 
-static void GraphSafeRemoveNodes(Graph* graph,
-                                 const std::unordered_set<const Node*>& nodes) {
-  for (auto* node : nodes) {
-    graph->RemoveNode(const_cast<Node*>(node));
-  }
+// Graph safely remove some nodes, will automatically clean up the edges.
+void GraphSafeRemoveNodes(Graph* graph,
+                          const std::unordered_set<const Node*>& nodes);
 
-  for (auto* node : graph->Nodes()) {
-    for (auto it = node->inputs.begin(); it != node->inputs.end();) {
-      if (nodes.count(*it)) {
-        it = const_cast<Node*>(node)->inputs.erase(it);
-      } else
-        it++;
-    }
-    for (auto it = node->outputs.begin(); it != node->outputs.end();) {
-      if (nodes.count(*it)) {
-        it = const_cast<Node*>(node)->outputs.erase(it);
-      } else
-        it++;
-    }
+// Some pre-defined patterns those can be reused in multiple passes.
+// The related Fluid Layer or Op should be one pattern here for better reusage
+// accross different fusion.
+namespace patterns {
+
+struct KeyCounter {
+  static KeyCounter& Instance() {
+    static KeyCounter x;
+    return x;
   }
+
+  int IncCounter(const std::string& key) { return dic_[key]++; }
+
+ private:
+  std::unordered_map<std::string, size_t> dic_;
+};
+
+// Generate a unique PDNode's name with name_scope and id.
+// The format is {name_scope}/{repr}/{id}/{name}
+static std::string PDNodeName(const std::string& name_scope,
+                              const std::string& repr, size_t id,
+                              const std::string& name) {
+  return string::Sprintf("%s/%s/%d/%s", name_scope, repr, id, name);
+}
+// Generate a unique PDNode's name.
+// The format is {name_scope}/{repr}/{id}
+static std::string PDNodeName(const std::string& name_scope,
+                              const std::string& repr) {
+  return string::Sprintf("%s/%s/%d", name_scope, repr,
+                         KeyCounter::Instance().IncCounter(repr));
 }
+// Generate a unique key. It can be used for a universally unique temporary
+// name.
+// The format is {repr}/{id}
+static std::string UniqueKey(const std::string& repr) {
+  return string::Sprintf("%s/%d", repr,
+                         KeyCounter::Instance().IncCounter(repr));
+}
+
+// Declare a PDNode in a pattern, will create two methods:
+// std::string xxx_repr(); return this PDNode's string id.
+// PDNode* xxx_n(); return the corresponding PDNode.
+#define PATTERN_DECL_NODE(name__)                        \
+  std::string name__##_repr() const {                    \
+    return PDNodeName(name_scope_, repr_, id_, #name__); \
+  }                                                      \
+  PDNode* name__##_n() const { return pattern->RetrieveNode(name__##_repr()); }
+
+// Get an ir::Node* from the matched subgraph.
+// var: variable.
+// arg: the argument declared by PATTERN_DECL_NODE in a pattern definition.
+// pat: the pattern object.
+#define GET_IR_NODE_FROM_SUBGRAPH(var, arg, pat)                    \
+  PADDLE_ENFORCE(subgraph.count(pat.arg##_n()),                     \
+                 "Node not found for PDNode %s", pat.arg##_repr()); \
+  Node* var = subgraph.at(pat.arg##_n());                           \
+  PADDLE_ENFORCE(var, "node %s not exists in the sub-graph", #arg)
+
+// The base class of all the patterns.
+struct PatternBase {
+  PatternBase(PDPattern* pattern, const std::string& name_scope,
+              const std::string& repr)
+      : pattern(pattern),
+        name_scope_(name_scope),
+        repr_(repr),
+        id_(KeyCounter::Instance().IncCounter(repr)) {}
+
+  PDPattern* pattern;
+
+ protected:
+  std::string name_scope_;
+  std::string repr_;
+  size_t id_;
+};
+
+// FC with bias
+// op: mul + elementwise_add
+// named nodes:
+// mul, elementwise_add
+// w, mul_out, bias, fc_out
+struct FC : public PatternBase {
+  FC(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "fc") {}
+
+  PDNode* operator()(PDNode* x, bool with_bias);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(fc);
+  PATTERN_DECL_NODE(mul);
+  PATTERN_DECL_NODE(elementwise_add);
+  // declare variable node's name
+  PATTERN_DECL_NODE(w);
+  PATTERN_DECL_NODE(mul_out);  // (x,w) -> mul_out
+  PATTERN_DECL_NODE(bias);
+  PATTERN_DECL_NODE(Out);
+};
+
+struct LSTM : public PatternBase {
+  LSTM(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "lstm") {}
+
+  PDNode* operator()(PDNode* x);
+
+  // Operators
+  PATTERN_DECL_NODE(lstm);
+
+  // Inputs
+  PATTERN_DECL_NODE(Input);
+  PATTERN_DECL_NODE(H0);
+  PATTERN_DECL_NODE(C0);
+  PATTERN_DECL_NODE(Weight);
+  PATTERN_DECL_NODE(Bias);
+
+  // Outputs
+  PATTERN_DECL_NODE(Hidden);
+  PATTERN_DECL_NODE(Cell);
+  PATTERN_DECL_NODE(BatchGate);
+  PATTERN_DECL_NODE(BatchCellPreAct);
+};
+
+struct GRU : public PatternBase {
+  GRU(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "lstm") {}
+
+  PDNode* operator()(PDNode* x);
+
+  // Operators
+  PATTERN_DECL_NODE(gru);
+
+  // Inputs
+  PATTERN_DECL_NODE(Bias);
+  PATTERN_DECL_NODE(Weight);
+
+  // Outputs
+  PATTERN_DECL_NODE(BatchGate);
+  PATTERN_DECL_NODE(BatchResetHiddenPrev);
+  PATTERN_DECL_NODE(BatchHidden);
+  PATTERN_DECL_NODE(Hidden);
+};
+
+}  // namespace patterns
+
+// Link two ir::Nodes from each other.
+#define IR_NODE_LINK_TO(a, b) \
+  a->outputs.push_back(b);    \
+  b->inputs.push_back(a);
 
 }  // namespace ir
 }  // namespace framework
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
index 7e5c86b033a7c69a306491cf4bf8d099018c5f19..6c466fb21fb46e09961dc874e9e39655f83d17c6 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
@@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
         return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
       },
       "OP0");
-  auto* any_var = x.mutable_pattern()->NewNode(
-      [](Node* node) { return node->IsVar(); }, "VAR");
+  auto* any_var = x.mutable_pattern()
+                      ->NewNode([](Node* node) { return node->IsVar(); }, "VAR")
+                      ->AsIntermediate();
   auto* any_op1 = x.mutable_pattern()->NewNode(
       [](Node* node) { return node->IsOp(); }, "OP1");
 
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
index 4c7ffe69e933de3d52c8f762a1eeb73de17e0561..31ed98db72c8fd4af8c970861d386687962001ce 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -50,20 +50,37 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
 
   Dot dot;
 
-  std::vector<Dot::Attr> op_attrs({Dot::Attr("style", "filled"),
-                                   Dot::Attr("shape", "box"),
-                                   Dot::Attr("fillcolor", "red")});
-  std::vector<Dot::Attr> var_attrs({Dot::Attr("style", "filled,rounded"),
-                                    // Dot::Attr("shape", "diamond"),
-                                    Dot::Attr("fillcolor", "yellow")});
-
-  std::vector<Dot::Attr> marked_op_attrs({Dot::Attr("style", "filled"),
-                                          Dot::Attr("shape", "box"),
-                                          Dot::Attr("fillcolor", "lightgray")});
-  std::vector<Dot::Attr> marked_var_attrs(
-      {Dot::Attr("style", "filled,rounded"),
-       // Dot::Attr("shape", "diamond"),
-       Dot::Attr("fillcolor", "lightgray")});
+  const std::vector<Dot::Attr> op_attrs({
+      Dot::Attr("style", "rounded,filled,bold"),  //
+      Dot::Attr("shape", "box"),                  //
+      Dot::Attr("color", "#303A3A"),              //
+      Dot::Attr("fontcolor", "#ffffff"),          //
+      Dot::Attr("width", "1.3"),                  //
+      Dot::Attr("height", "0.84"),                //
+      Dot::Attr("fontname", "Arial"),             //
+  });
+  const std::vector<Dot::Attr> arg_attrs({
+      Dot::Attr("shape", "box"),                  //
+      Dot::Attr("style", "rounded,filled,bold"),  //
+      Dot::Attr("fontname", "Arial"),             //
+      Dot::Attr("fillcolor", "#999999"),          //
+      Dot::Attr("color", "#dddddd"),              //
+  });
+
+  const std::vector<Dot::Attr> param_attrs({
+      Dot::Attr("shape", "box"),                  //
+      Dot::Attr("style", "rounded,filled,bold"),  //
+      Dot::Attr("fontname", "Arial"),             //
+      Dot::Attr("color", "#148b97"),              //
+      Dot::Attr("fontcolor", "#ffffff"),          //
+  });
+
+  const std::vector<Dot::Attr> marked_op_attrs(
+      {Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"),
+       Dot::Attr("fillcolor", "yellow")});
+  const std::vector<Dot::Attr> marked_var_attrs(
+      {Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"),
+       Dot::Attr("fillcolor", "yellow")});
 
   auto marked_nodes = ConsumeMarkedNodes(graph.get());
   // Create nodes
@@ -74,9 +91,17 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
           marked_nodes.count(n) ? marked_op_attrs : op_attrs;
       dot.AddNode(node_id, attr, node_id);
     } else if (n->IsVar()) {
-      decltype(op_attrs) attr =
-          marked_nodes.count(n) ? marked_var_attrs : var_attrs;
-      dot.AddNode(node_id, attr, node_id);
+      decltype(op_attrs)* attr;
+      if (marked_nodes.count(n)) {
+        attr = &marked_var_attrs;
+      } else if (const_cast<Node*>(n)->Var() &&
+                 const_cast<Node*>(n)->Var()->Persistable()) {
+        attr = &param_attrs;
+      } else {
+        attr = &arg_attrs;
+      }
+
+      dot.AddNode(node_id, *attr, node_id);
     }
     node2dot[n] = node_id;
   }
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h
index 8d885cb9e4ee6e01de386b0f22423988dbe60ca6..e64916a5bb662e3b00cfe212f0bbbc537c7bc2cc 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.h
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
@@ -42,6 +42,13 @@ class GraphVizPass : public Pass {
   marked_nodes_t ConsumeMarkedNodes(Graph* graph) const;
 };
 
+static GraphVizPass::marked_nodes_t& GetMarkedNodes(Graph* graph) {
+  if (!graph->Has(kGraphvizMarkedNodeAttr)) {
+    graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t);
+  }
+  return graph->Get<GraphVizPass::marked_nodes_t>(kGraphvizMarkedNodeAttr);
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
index f885567da1965b997b2063e06c839af95b43e1e1..7713ed1eab88ee4fa16d52e7425075ae66f721a3 100644
--- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
@@ -13,42 +13,41 @@
 // limitations under the License.
 
 #include <algorithm>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
-class InferCleanGraphPass : public Pass {
+class InferCleanGraphPass : public FusePassBase {
  public:
   virtual ~InferCleanGraphPass() {}
 
  protected:
   std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const {
+    FusePassBase::Init("original_graph", graph.get());
     PADDLE_ENFORCE(graph.get());
 
     auto is_valid_node = [](Node* x) {
       return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
     };
 
-    std::unordered_set<Node*> invalid_nodes;
+    std::unordered_set<const Node*> invalid_nodes;
+    int valid_op = 0;
     for (auto* node : graph->Nodes()) {
       if (is_valid_node(node)) {
         invalid_nodes.insert(node);
+      } else if (node->IsOp()) {
+        // Collect all the operators to help tracking number of operators.
+        ++valid_op;
       }
     }
 
-    // remove nodes from the graph.
-    for (auto* node : invalid_nodes) {
-      graph->RemoveNode(node);
-    }
+    GraphSafeRemoveNodes(graph.get(), invalid_nodes);
 
-    // clean edges.
-    for (auto* node : graph->Nodes()) {
-      CleanEdges(&node->inputs, invalid_nodes);
-      CleanEdges(&node->outputs, invalid_nodes);
-    }
+    AddStatis(valid_op);
 
     return graph;
   }
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
index a776a898a5ee13b4dde12460dce71433268fb9d4..a7d5161c35db804703415066990f34da8109fbd9 100644
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
@@ -192,6 +192,8 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
   auto* id = subgraph.at(pattern.RetrieveNode(#id));        \
   PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
 
+  int fuse_count{0};
+
   detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph,
                             Graph* graph) {
     VLOG(4) << "get one concat pattern";
@@ -219,16 +221,13 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
     op_desc.SetAttr("fc_activation", act->Op()->Type());
 
     auto* op_node = graph->CreateOpNode(&op_desc);
-// Add links
-#define NODE_LINKS(a, b)   \
-  a->outputs.push_back(b); \
-  b->inputs.push_back(a);
-    NODE_LINKS(fc_w, op_node);
-    NODE_LINKS(fc_bias, op_node);
-    NODE_LINKS(concat_in0, op_node);
-    NODE_LINKS(sequence_expand0_in, op_node);
-    NODE_LINKS(sequence_expand1_in, op_node);
-    NODE_LINKS(op_node, fc_out);
+    // Add links
+    IR_NODE_LINK_TO(fc_w, op_node);
+    IR_NODE_LINK_TO(fc_bias, op_node);
+    IR_NODE_LINK_TO(concat_in0, op_node);
+    IR_NODE_LINK_TO(sequence_expand0_in, op_node);
+    IR_NODE_LINK_TO(sequence_expand1_in, op_node);
+    IR_NODE_LINK_TO(op_node, fc_out);
 
     // Clean nodes.
     std::unordered_set<const Node*> marked_nodes;
@@ -241,10 +240,13 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
     marked_nodes.erase(sequence_expand0_in);
     marked_nodes.erase(sequence_expand1_in);
     marked_nodes.erase(fc_out);
-
     GraphSafeRemoveNodes(graph, marked_nodes);
+
+    ++fuse_count;
   });
 
+  AddStatis(fuse_count);
+
   return graph;
 }
 
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index a4319ffabb04f39437b76d97845e021ef9de66d3..8c290bb095d554a973e66a3a19606a06759fd668 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -49,7 +49,7 @@ struct TensorCopyVisitor {
         size_(size) {}
 
   template <typename T>
-  void operator()() const {
+  void apply() const {
     // TODO(Yancey1989): support other place
     platform::CPUPlace cpu;
     memory::Copy(cpu, dst_->mutable_data<T>(cpu) + dst_offset_, cpu,
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index ab693004cfb038fd92afd9c60e0fcb4e16b9f8a9..05c4a17a01c6fabe48f3fe18544c13153feb0673 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -149,7 +149,7 @@ struct AnyDTypeVisitor {
       : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
 
   template <typename T>
-  void operator()() const {
+  void apply() const {
     auto t = EigenVector<T>::Flatten(tensor_);
     auto o = EigenScalar<bool>::From(*out_);
     // return any of predicate_(t) is true.
@@ -302,7 +302,7 @@ struct DeserializedDataFunctor {
       : buf_(buf), tensor_(tensor), place_(place) {}
 
   template <typename T>
-  void operator()() {
+  void apply() {
     *buf_ = tensor_->mutable_data<T>(place_);
   }
 
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index a4f6364ae5b7d832096c92e9c6d8b3e865713cff..2006e3b24f71d0ae32b4e2ae34f1a1e4d3a82f91 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -10,19 +10,19 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
 # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
 cc_library(paddle_fluid_api
     SRCS io.cc
-    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass)
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) 
 
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 
 # paddle_fluid_origin exclude inference api interface
 cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
 
-if(NOT APPLE)
+#if(APPLE)
   add_subdirectory(api)
-endif()
+#endif()
 
 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api)
+cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor)
 if(NOT APPLE)
   # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
   set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
@@ -32,6 +32,7 @@ endif()
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
     SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
     DEPS ${fluid_modules} paddle_fluid_api)
 
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 779ede5e460d0ceb6fd404c4a32374f9f9d92088..11a7509feb02a806e1e173bfb8bd7764f94d3457 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -6,6 +6,7 @@ cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits
   analyzer.cc
   helper.cc
   # passes
+  analysis_pass.cc
   fluid_to_data_flow_graph_pass.cc
   data_flow_graph_to_fluid_pass.cc
   dfg_graphviz_draw_pass.cc
@@ -25,61 +26,44 @@ function (inference_analysis_test TARGET)
     if(WITH_TESTING)
         set(options "")
         set(oneValueArgs "")
-        set(multiValueArgs SRCS EXTRA_DEPS)
+        set(multiValueArgs SRCS ARGS EXTRA_DEPS)
         cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
         set(mem_opt "")
         if(WITH_GPU)
             set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
         endif()
         cc_test(${TARGET}
                 SRCS "${analysis_test_SRCS}"
-                DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS}
-                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
+                DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
+                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
         set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
     endif(WITH_TESTING)
 endfunction(inference_analysis_test)
 
-set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz")
-set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz")
-set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE)
-set(DITU_RNN_MODEL ${DITU_INSTALL_DIR}/model)
-set(DITU_RNN_DATA ${DITU_INSTALL_DIR}/data.txt)
-
-function (inference_download_and_uncompress target url gz_filename)
+function (inference_download_and_uncompress install_dir url gz_filename)
     message(STATUS "Download inference test stuff ${gz_filename} from ${url}")
-    execute_process(COMMAND bash -c "mkdir -p ${DITU_INSTALL_DIR}")
-    execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && wget -q ${url}")
-    execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && tar xzf ${gz_filename}")
+    execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
+    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}")
+    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}")
     message(STATUS "finish downloading ${gz_filename}")
 endfunction(inference_download_and_uncompress)
 
-if (NOT EXISTS ${DITU_INSTALL_DIR})
-    inference_download_and_uncompress(ditu_rnn_model ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz")
-    inference_download_and_uncompress(ditu_rnn_data ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz")
+set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz")
+set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz")
+set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1" CACHE PATH "RNN1 model and data root." FORCE)
+if (NOT EXISTS ${RNN1_INSTALL_DIR} AND WITH_TESTING)
+  inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} "rnn1%2Fmodel.tar.gz")
+  inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_DATA_URL} "rnn1%2Fdata.txt.tar.gz")
 endif()
 
 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
-    analysis_predictor
-		# ir
-		fc_fuse_pass
-		fc_lstm_fuse_pass
-    seq_concat_fc_fuse_pass
-		graph_viz_pass
-		infer_clean_graph_pass
-		graph_pattern_detector
-    infer_clean_graph_pass
-    attention_lstm_fuse_pass
-    paddle_inference_api
-		pass
-    ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model
-        --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
-        --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
+    ARGS --infer_model=${RNN1_INSTALL_DIR}/model
+         --infer_data=${RNN1_INSTALL_DIR}/data.txt)
 
 inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
-inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api)
-inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid)
+inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
+inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
 inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
 inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
 inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
@@ -87,3 +71,46 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
 inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
 inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
 inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
+
+set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
+set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
+set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
+if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
+  inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
+  inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
+endif()
+
+inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
+    ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
+        --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
+
+set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
+set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
+set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
+if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
+    inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
+    inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
+endif()
+
+inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
+    ARGS --infer_model=${LAC_INSTALL_DIR}/model
+        --infer_data=${LAC_INSTALL_DIR}/data.txt)
+
+
+set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
+set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
+set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE)
+
+if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
+  inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
+  inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz")
+endif()
+
+inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
+    ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
+         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
+         --topn=1 # Just run top 1 batch.
+    )
diff --git a/paddle/fluid/inference/analysis/pass.cc b/paddle/fluid/inference/analysis/analysis_pass.cc
similarity index 91%
rename from paddle/fluid/inference/analysis/pass.cc
rename to paddle/fluid/inference/analysis/analysis_pass.cc
index 121b72c0a0aa9a0c568b04f7ee9a5bc5c1d6f5f8..9be9f755b9ed7273d842f8c0e2046f0ca0ce2247 100644
--- a/paddle/fluid/inference/analysis/pass.cc
+++ b/paddle/fluid/inference/analysis/analysis_pass.cc
@@ -12,4 +12,4 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/analysis_pass.h
similarity index 59%
rename from paddle/fluid/inference/analysis/pass.h
rename to paddle/fluid/inference/analysis/analysis_pass.h
index 7719c6f5ff3c940948c7bdbcb25513cdf430281b..b6edb5529ace2ad5bd1b35bfbee1f7a744457cc3 100644
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
@@ -28,10 +28,10 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-class Pass {
+class AnalysisPass {
  public:
-  Pass() = default;
-  virtual ~Pass() = default;
+  AnalysisPass() = default;
+  virtual ~AnalysisPass() = default;
   // Mutable Pass.
   virtual bool Initialize(Argument *argument) { return false; }
   // Readonly Pass.
@@ -42,23 +42,16 @@ class Pass {
   virtual bool Finalize() { return false; }
 
   // Get a Pass appropriate to print the Node this pass operates on.
-  virtual Pass *CreatePrinterPass(std::ostream &os,
-                                  const std::string &banner) const {
+  virtual AnalysisPass *CreatePrinterPass(std::ostream &os,
+                                          const std::string &banner) const {
     return nullptr;
   }
 
   // Create a debugger Pass that draw the DFG by graphviz toolkit.
-  virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
+  virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }
 
-  virtual void Run() { LOG(FATAL) << "not valid"; }
-  // Run on a single Node.
-  virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
-  // Run on a single Function.
-  virtual void Run(Function *x) { LOG(FATAL) << "not valid"; }
-  // Run on a single FunctionBlock.
-  virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
   // Run on a single DataFlowGraph.
-  virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; }
+  virtual void Run(DataFlowGraph *x) = 0;
 
   // Human-readable short representation.
   virtual std::string repr() const = 0;
@@ -66,29 +59,8 @@ class Pass {
   virtual std::string description() const { return "No DOC"; }
 };
 
-// NodePass process on any Node types.
-class NodePass : public Pass {
- public:
-  virtual void Run(Node *node) = 0;
-};
-
-// NodePass process on any Function node types.
-class FunctionPass : public Pass {
- public:
-  virtual void Run(Function *node) = 0;
-};
-
-// NodePass process on any FunctionBlock node types.
-class FunctionBlockPass : public Pass {
- public:
-  virtual void Run(FunctionBlock *node) = 0;
-};
-
 // GraphPass processes on any GraphType.
-class DataFlowGraphPass : public Pass {
- public:
-  virtual void Run(DataFlowGraph *graph) = 0;
-};
+class DataFlowGraphPass : public AnalysisPass {};
 
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index e6e63544ffa2de09e39b02769aaaf0793d6b1111..6dc39cae0522efd48c2e2921611adebd6937ddf7 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include <string>
+#include <vector>
+
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
@@ -41,27 +43,23 @@ class DfgPassManagerImpl final : public DfgPassManager {
  public:
   DfgPassManagerImpl() {
     // TODO(Superjomn) set the key with pass reprs.
-    LOG(INFO)
-        << "-----------------------------------------------------------------";
-    if (FLAGS_IA_enable_ir) {
-      AddPass("fluid-to-ir-pass", new FluidToIrPass);
-    } else {
+    if (!FLAGS_IA_enable_ir) {
       AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
+    } else {
+      AddPass("fluid-to-ir-pass", new FluidToIrPass);
     }
     TryAddTensorRtPass();
     AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
     if (!FLAGS_IA_output_storage_path.empty()) {
       AddPass("model-store-pass", new ModelStorePass);
     }
-    LOG(INFO)
-        << "-----------------------------------------------------------------";
   }
 
   std::string repr() const override { return "dfg-pass-manager"; }
   std::string description() const override { return "DFG pass manager."; }
 
  private:
-  void AddPass(const std::string& name, Pass* pass) {
+  void AddPass(const std::string& name, AnalysisPass* pass) {
     VLOG(3) << "Adding pass " << name;
     Register(name, pass);
     AddGraphvizDebugerPass(pass);
@@ -90,7 +88,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
   }
 
   // Add the graphviz debuger pass if the parent pass has one.
-  void AddGraphvizDebugerPass(Pass* pass) {
+  void AddGraphvizDebugerPass(AnalysisPass* pass) {
     auto* debuger_pass = pass->CreateGraphvizDebugerPass();
     if (debuger_pass) {
       Register(debuger_pass->repr(), debuger_pass);
@@ -101,18 +99,15 @@ class DfgPassManagerImpl final : public DfgPassManager {
 Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
 
 void Analyzer::Run(Argument* argument) {
-  // Ugly support fluid-to-ir-pass
-  argument->Set(kFluidToIrPassesAttr,
-                new std::vector<std::string>({
-                    // Manual update the passes here.
-                    "graph_viz_pass",                              //
-                    "infer_clean_graph_pass", "graph_viz_pass",    //
-                    "attention_lstm_fuse_pass", "graph_viz_pass",  //
-                    "fc_lstm_fuse_pass", "graph_viz_pass",         //
-                    "seq_concat_fc_fuse_pass", "graph_viz_pass",   //
-                    "fc_fuse_pass", "graph_viz_pass"               //
-
-                }));
+  std::vector<std::string> passes;
+  for (auto& pass : all_ir_passes_) {
+    if (!disabled_ir_passes_.count(pass)) {
+      passes.push_back(pass);
+      passes.push_back("graph_viz_pass");  // add graphviz for debug.
+    }
+  }
+  passes.push_back("graph_viz_pass");
+  argument->Set(kFluidToIrPassesAttr, new std::vector<std::string>(passes));
 
   for (auto& x : data_) {
     PADDLE_ENFORCE(x->Initialize(argument));
@@ -121,6 +116,11 @@ void Analyzer::Run(Argument* argument) {
   }
 }
 
+Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
+  disabled_ir_passes_.insert(passes.begin(), passes.end());
+  return *this;
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 2e107c82dd50d5cf22797f4c82e69d302514f955..399afbe64a56393176795ecdd1ac70bfedd5c91a 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -36,16 +36,12 @@ limitations under the License. */
  */
 
 #include <gflags/gflags.h>
-#include "paddle/fluid/inference/analysis/pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/inference/analysis/flags.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 
-// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
-// flag if not available.
-DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
-DECLARE_string(IA_graphviz_log_root);
-DECLARE_string(IA_output_storage_path);
-DECLARE_bool(IA_enable_ir);
-
 namespace paddle {
 namespace inference {
 namespace analysis {
@@ -57,7 +53,28 @@ class Analyzer : public OrderedRegistry<PassManager> {
 
   void Run(Argument* argument);
 
+  Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
+
   DISABLE_COPY_AND_ASSIGN(Analyzer);
+
+ private:
+  // All avaiable IR passes.
+  // The bigger fuse comes first, so that the small operators prefer to be
+  // merged in a larger fuse op. The small fusion will not break the pattern of
+  // larger fusion.
+  const std::vector<std::string> all_ir_passes_{{
+      // Manual update the passes here.
+      "infer_clean_graph_pass",    //
+      "attention_lstm_fuse_pass",  //
+      "fc_lstm_fuse_pass",         //
+      "mul_lstm_fuse_pass",        //
+      "fc_gru_fuse_pass",          //
+      "mul_gru_fuse_pass",         //
+      "seq_concat_fc_fuse_pass",   //
+      "fc_fuse_pass",              //
+  }};
+
+  std::unordered_set<std::string> disabled_ir_passes_;
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..522d870db8583aac4006e8cdb7909625c3feb34b
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -0,0 +1,273 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_string(infer_model, "", "model path for LAC");
+DEFINE_string(infer_data, "", "data file for LAC");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(burning, 0, "Burning before repeat.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+struct DataRecord {
+  std::vector<int64_t> data;
+  std::vector<size_t> lod;
+  // for dataset and nextbatch
+  size_t batch_iter{0};
+  std::vector<std::vector<size_t>> batched_lods;
+  std::vector<std::vector<int64_t>> batched_datas;
+  std::vector<std::vector<int64_t>> datasets;
+  DataRecord() = default;
+  explicit DataRecord(const std::string &path, int batch_size = 1) {
+    Load(path);
+    Prepare(batch_size);
+    batch_iter = 0;
+  }
+  void Load(const std::string &path) {
+    std::ifstream file(path);
+    std::string line;
+    int num_lines = 0;
+    datasets.resize(0);
+    while (std::getline(file, line)) {
+      num_lines++;
+      std::vector<std::string> data;
+      split(line, ';', &data);
+      std::vector<int64_t> words_ids;
+      split_to_int64(data[1], ' ', &words_ids);
+      datasets.emplace_back(words_ids);
+    }
+  }
+  void Prepare(int bs) {
+    if (bs == 1) {
+      batched_datas = datasets;
+      for (auto one_sentence : datasets) {
+        batched_lods.push_back({0, one_sentence.size()});
+      }
+    } else {
+      std::vector<int64_t> one_batch;
+      std::vector<size_t> lod{0};
+      int bs_id = 0;
+      for (auto one_sentence : datasets) {
+        bs_id++;
+        one_batch.insert(one_batch.end(), one_sentence.begin(),
+                         one_sentence.end());
+        lod.push_back(lod.back() + one_sentence.size());
+        if (bs_id == bs) {
+          bs_id = 0;
+          batched_datas.push_back(one_batch);
+          batched_lods.push_back(lod);
+          one_batch.clear();
+          one_batch.resize(0);
+          lod.clear();
+          lod.resize(0);
+          lod.push_back(0);
+        }
+      }
+      if (one_batch.size() != 0) {
+        batched_datas.push_back(one_batch);
+        batched_lods.push_back(lod);
+      }
+    }
+  }
+  DataRecord NextBatch() {
+    DataRecord data;
+    data.data = batched_datas[batch_iter];
+    data.lod = batched_lods[batch_iter];
+    batch_iter++;
+    if (batch_iter >= batched_datas.size()) {
+      batch_iter = 0;
+    }
+    return data;
+  }
+};
+
+void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
+                 int batch_size) {
+  auto one_batch = data->NextBatch();
+  PaddleTensor input_tensor;
+  input_tensor.name = "word";
+  input_tensor.shape.assign({static_cast<int>(one_batch.data.size()), 1});
+  input_tensor.lod.assign({one_batch.lod});
+  input_tensor.dtype = PaddleDType::INT64;
+  TensorAssignData<int64_t>(&input_tensor, {one_batch.data});
+  PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
+  input_slots->assign({input_tensor});
+}
+
+void BenchAllData(const std::string &model_path, const std::string &data_file,
+                  const int batch_size, const int repeat) {
+  NativeConfig config;
+  config.model_dir = model_path;
+  config.use_gpu = false;
+  config.device = 0;
+  config.specify_input_name = true;
+  std::vector<PaddleTensor> input_slots, outputs_slots;
+  DataRecord data(data_file, batch_size);
+  auto predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  GetOneBatch(&input_slots, &data, batch_size);
+  for (int i = 0; i < FLAGS_burning; i++) {
+    predictor->Run(input_slots, &outputs_slots);
+  }
+  Timer timer;
+  double sum = 0;
+  for (int i = 0; i < repeat; i++) {
+    for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
+      GetOneBatch(&input_slots, &data, batch_size);
+      timer.tic();
+      predictor->Run(input_slots, &outputs_slots);
+      sum += timer.toc();
+    }
+  }
+  PrintTime(batch_size, repeat, 1, 0, sum / repeat);
+}
+
+const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
+                                25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
+                                44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
+                                14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
+
+void TestLACPrediction(const std::string &model_path,
+                       const std::string &data_file, const int batch_size,
+                       const int repeat, bool test_all_data,
+                       bool use_analysis = false) {
+  NativeConfig config;
+  config.model_dir = model_path;
+  config.use_gpu = false;
+  config.device = 0;
+  config.specify_input_name = true;
+  std::vector<PaddleTensor> input_slots, outputs_slots;
+  DataRecord data(data_file, batch_size);
+  GetOneBatch(&input_slots, &data, batch_size);
+  std::unique_ptr<PaddlePredictor> predictor;
+  if (use_analysis) {
+    AnalysisConfig cfg;
+    cfg.model_dir = model_path;
+    cfg.use_gpu = false;
+    cfg.device = 0;
+    cfg.specify_input_name = true;
+    cfg.enable_ir_optim = true;
+    predictor =
+        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
+  } else {
+    predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  }
+  for (int i = 0; i < FLAGS_burning; i++) {
+    predictor->Run(input_slots, &outputs_slots);
+  }
+  Timer timer;
+  if (test_all_data) {
+    double sum = 0;
+    LOG(INFO) << "Total number of samples: " << data.datasets.size();
+    for (int i = 0; i < repeat; i++) {
+      for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
+        GetOneBatch(&input_slots, &data, batch_size);
+        timer.tic();
+        predictor->Run(input_slots, &outputs_slots);
+        sum += timer.toc();
+      }
+    }
+    PrintTime(batch_size, repeat, 1, 0, sum / repeat);
+    LOG(INFO) << "Average latency of each sample: "
+              << sum / repeat / data.datasets.size() << " ms";
+    return;
+  }
+  timer.tic();
+  for (int i = 0; i < repeat; i++) {
+    predictor->Run(input_slots, &outputs_slots);
+  }
+  PrintTime(batch_size, repeat, 1, 0, timer.toc() / repeat);
+
+  // check result
+  EXPECT_EQ(outputs_slots.size(), 1UL);
+  auto &out = outputs_slots[0];
+  size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                [](int a, int b) { return a * b; });
+  size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
+  PADDLE_ENFORCE_GT(size, 0);
+  EXPECT_GE(size, batch1_size);
+  int64_t *pdata = static_cast<int64_t *>(out.data.data());
+  for (size_t i = 0; i < batch1_size; ++i) {
+    EXPECT_EQ(pdata[i], lac_ref_data[i]);
+  }
+
+  if (use_analysis) {
+    // run once for comparion as reference
+    auto ref_predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    std::vector<PaddleTensor> ref_outputs_slots;
+    ref_predictor->Run(input_slots, &ref_outputs_slots);
+    EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
+    auto &ref_out = ref_outputs_slots[0];
+    size_t ref_size =
+        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
+                        [](int a, int b) { return a * b; });
+    EXPECT_EQ(size, ref_size);
+    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
+    for (size_t i = 0; i < size; ++i) {
+      EXPECT_EQ(pdata_ref[i], pdata[i]);
+    }
+
+    AnalysisPredictor *analysis_predictor =
+        dynamic_cast<AnalysisPredictor *>(predictor.get());
+    auto &fuse_statis = analysis_predictor->analysis_argument()
+                            .Get<std::unordered_map<std::string, int>>(
+                                framework::ir::kFuseStatisAttr);
+    for (auto &item : fuse_statis) {
+      LOG(INFO) << "fused " << item.first << " " << item.second;
+    }
+    int num_ops = 0;
+    for (auto &node :
+         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+      if (node->IsFunction()) {
+        ++num_ops;
+      }
+    }
+    LOG(INFO) << "has num ops: " << num_ops;
+    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+    ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+    EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
+    EXPECT_EQ(num_ops, 11);
+  }
+}
+
+TEST(Analyzer_LAC, native) {
+  LOG(INFO) << "LAC with native";
+  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
+                    FLAGS_repeat, FLAGS_test_all_data);
+}
+
+TEST(Analyzer_LAC, analysis) {
+  LOG(INFO) << "LAC with analysis";
+  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
+                    FLAGS_repeat, FLAGS_test_all_data, true);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..661b047ed7cb70545267e468d8c2c48596a2994c
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
@@ -0,0 +1,227 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_string(infer_model, "", "model path");
+DEFINE_string(infer_data, "", "data path");
+DEFINE_int32(batch_size, 10, "batch size.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+
+namespace paddle {
+namespace inference {
+
+struct DataRecord {
+  std::vector<std::vector<int64_t>> word_data_all, mention_data_all;
+  std::vector<std::vector<int64_t>> rnn_word_datas, rnn_mention_datas;
+  std::vector<size_t> lod;  // two inputs have the same lod info.
+  size_t batch_iter{0};
+  size_t batch_size{1};
+  size_t num_samples;  // total number of samples
+  DataRecord() = default;
+  explicit DataRecord(const std::string &path, int batch_size = 1)
+      : batch_size(batch_size) {
+    Load(path);
+  }
+  DataRecord NextBatch() {
+    DataRecord data;
+    size_t batch_end = batch_iter + batch_size;
+    // NOTE skip the final batch, if no enough data is provided.
+    if (batch_end <= word_data_all.size()) {
+      data.word_data_all.assign(word_data_all.begin() + batch_iter,
+                                word_data_all.begin() + batch_end);
+      data.mention_data_all.assign(mention_data_all.begin() + batch_iter,
+                                   mention_data_all.begin() + batch_end);
+      // Prepare LoDs
+      data.lod.push_back(0);
+      CHECK(!data.word_data_all.empty());
+      CHECK(!data.mention_data_all.empty());
+      CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size());
+      for (size_t j = 0; j < data.word_data_all.size(); j++) {
+        data.rnn_word_datas.push_back(data.word_data_all[j]);
+        data.rnn_mention_datas.push_back(data.mention_data_all[j]);
+        // calculate lod
+        data.lod.push_back(data.lod.back() + data.word_data_all[j].size());
+      }
+    }
+    batch_iter += batch_size;
+    return data;
+  }
+  void Load(const std::string &path) {
+    std::ifstream file(path);
+    std::string line;
+    int num_lines = 0;
+    while (std::getline(file, line)) {
+      num_lines++;
+      std::vector<std::string> data;
+      split(line, ';', &data);
+      // load word data
+      std::vector<int64_t> word_data;
+      split_to_int64(data[1], ' ', &word_data);
+      // load mention data
+      std::vector<int64_t> mention_data;
+      split_to_int64(data[3], ' ', &mention_data);
+      word_data_all.push_back(std::move(word_data));
+      mention_data_all.push_back(std::move(mention_data));
+    }
+    num_samples = num_lines;
+  }
+};
+
+void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
+                   int batch_size) {
+  PaddleTensor lod_word_tensor, lod_mention_tensor;
+  lod_word_tensor.name = "word";
+  lod_mention_tensor.name = "mention";
+  auto one_batch = data->NextBatch();
+  int size = one_batch.lod[one_batch.lod.size() - 1];  // token batch size
+  lod_word_tensor.shape.assign({size, 1});
+  lod_word_tensor.lod.assign({one_batch.lod});
+  lod_mention_tensor.shape.assign({size, 1});
+  lod_mention_tensor.lod.assign({one_batch.lod});
+  // assign data
+  TensorAssignData<int64_t>(&lod_word_tensor, one_batch.rnn_word_datas);
+  TensorAssignData<int64_t>(&lod_mention_tensor, one_batch.rnn_mention_datas);
+  // Set inputs.
+  input_slots->assign({lod_word_tensor, lod_mention_tensor});
+  for (auto &tensor : *input_slots) {
+    tensor.dtype = PaddleDType::INT64;
+  }
+}
+
+// the first inference result
+const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
+                                       48, 39, 38, 16, 25};
+
+void TestChineseNERPrediction(bool use_analysis) {
+  NativeConfig config;
+  config.prog_file = FLAGS_infer_model + "/__model__";
+  config.param_file = FLAGS_infer_model + "/param";
+  config.use_gpu = false;
+  config.device = 0;
+  config.specify_input_name = true;
+
+  std::vector<PaddleTensor> input_slots, outputs;
+  std::unique_ptr<PaddlePredictor> predictor;
+  Timer timer;
+  if (use_analysis) {
+    AnalysisConfig cfg;
+    cfg.prog_file = FLAGS_infer_model + "/__model__";
+    cfg.param_file = FLAGS_infer_model + "/param";
+    cfg.use_gpu = false;
+    cfg.device = 0;
+    cfg.specify_input_name = true;
+    cfg.enable_ir_optim = true;
+    predictor =
+        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
+  } else {
+    predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  }
+
+  if (FLAGS_test_all_data) {
+    LOG(INFO) << "test all data";
+    double sum = 0;
+    size_t num_samples;
+    for (int i = 0; i < FLAGS_repeat; i++) {
+      DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+      num_samples = data.num_samples;
+      for (size_t bid = 0; bid < num_samples; ++bid) {
+        PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+        timer.tic();
+        predictor->Run(input_slots, &outputs);
+        sum += timer.toc();
+      }
+    }
+    LOG(INFO) << "total number of samples: " << num_samples;
+    PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
+    LOG(INFO) << "average latency of each sample: "
+              << sum / FLAGS_repeat / num_samples;
+    return;
+  }
+  // Prepare inputs.
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+
+  timer.tic();
+  for (int i = 0; i < FLAGS_repeat; i++) {
+    predictor->Run(input_slots, &outputs);
+  }
+  PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, timer.toc() / FLAGS_repeat);
+
+  PADDLE_ENFORCE(outputs.size(), 1UL);
+  auto &out = outputs[0];
+  size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                [](int a, int b) { return a * b; });
+  PADDLE_ENFORCE_GT(size, 0);
+  int64_t *result = static_cast<int64_t *>(out.data.data());
+  for (size_t i = 0; i < std::min(11UL, size); i++) {
+    PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]);
+  }
+
+  if (use_analysis) {
+    // run once for comparion as reference
+    auto ref_predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    std::vector<PaddleTensor> ref_outputs_slots;
+    ref_predictor->Run(input_slots, &ref_outputs_slots);
+    EXPECT_EQ(ref_outputs_slots.size(), outputs.size());
+    auto &ref_out = ref_outputs_slots[0];
+    size_t ref_size =
+        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
+                        [](int a, int b) { return a * b; });
+    EXPECT_EQ(size, ref_size);
+    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
+    for (size_t i = 0; i < size; ++i) {
+      EXPECT_EQ(pdata_ref[i], result[i]);
+    }
+
+    AnalysisPredictor *analysis_predictor =
+        dynamic_cast<AnalysisPredictor *>(predictor.get());
+    auto &fuse_statis = analysis_predictor->analysis_argument()
+                            .Get<std::unordered_map<std::string, int>>(
+                                framework::ir::kFuseStatisAttr);
+    for (auto &item : fuse_statis) {
+      LOG(INFO) << "fused " << item.first << " " << item.second;
+    }
+    int num_ops = 0;
+    for (auto &node :
+         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+      if (node->IsFunction()) {
+        ++num_ops;
+      }
+    }
+    LOG(INFO) << "has num ops: " << num_ops;
+    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+    ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+    EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
+    EXPECT_EQ(num_ops, 14);
+  }
+}
+
+TEST(Analyzer_Chinese_ner, native) { TestChineseNERPrediction(false); }
+
+TEST(Analyzer_Chinese_ner, analysis) { TestChineseNERPrediction(true); }
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 2cc83c777ce7a92d4e00a95d2d2c40753e2ffbcf..cc4b390495c60cecc8ebebb3f17b38d9b5a15956 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -16,25 +16,27 @@
 
 #include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
+#include <thread>  // NOLINT
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/platform/profiler.h"
 
-DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
-DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
+DEFINE_string(infer_model, "", "model path");
+DEFINE_string(infer_data, "", "data path");
 DEFINE_int32(batch_size, 10, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-using namespace framework;
+using namespace framework;  // NOLINT
 
 TEST(Analyzer, analysis_without_tensorrt) {
   FLAGS_IA_enable_tensorrt_subgraph_engine = false;
@@ -201,13 +203,13 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   minute_tensor.lod.assign({one_batch.lod3});
   // clang-format on
   // assign data
-  TensorAssignData(&lod_attention_tensor,
-                   std::vector<std::vector<float>>({{0, 0}}));
+  TensorAssignData<float>(&lod_attention_tensor,
+                          std::vector<std::vector<float>>({{0, 0}}));
   std::vector<float> tmp_zeros(batch_size * 15, 0.);
-  TensorAssignData(&init_zero_tensor, {tmp_zeros});
-  TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data);
-  TensorAssignData(&week_tensor, one_batch.rnn_week_datas);
-  TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas);
+  TensorAssignData<float>(&init_zero_tensor, {tmp_zeros});
+  TensorAssignData<float>(&lod_tensor_tensor, one_batch.rnn_link_data);
+  TensorAssignData<float>(&week_tensor, one_batch.rnn_week_datas);
+  TensorAssignData<float>(&minute_tensor, one_batch.rnn_minute_datas);
   // Set inputs.
   auto init_zero_tensor1 = init_zero_tensor;
   init_zero_tensor1.name = "hidden_init";
@@ -219,101 +221,95 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   }
 }
 
-std::string DescribeTensor(const PaddleTensor &tensor) {
-  std::stringstream os;
-  os << "Tensor [" << tensor.name << "]\n";
-  os << " - type: ";
-  switch (tensor.dtype) {
-    case PaddleDType::FLOAT32:
-      os << "float32";
-      break;
-    case PaddleDType::INT64:
-      os << "int64";
-      break;
-    default:
-      os << "unset";
-  }
-  os << '\n';
-
-  os << " - shape: " << to_string(tensor.shape) << '\n';
-  os << " - lod: ";
-  for (auto &l : tensor.lod) {
-    os << to_string(l) << "; ";
-  }
-  os << "\n";
-  os << " - data: ";
+}  // namespace
 
-  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
-                            [](int a, int b) { return a * b; });
-  for (int i = 0; i < dim; i++) {
-    os << static_cast<float *>(tensor.data.data())[i] << " ";
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<PaddleTensor> &base_outputs) {
+  PADDLE_ENFORCE_GT(outputs.size(), 0);
+  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &base_out = base_outputs[i];
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
+                                   1, [](int a, int b) { return a * b; });
+    PADDLE_ENFORCE_EQ(size, size1);
+    PADDLE_ENFORCE_GT(size, 0);
+    float *data = static_cast<float *>(out.data.data());
+    float *base_data = static_cast<float *>(base_out.data.data());
+    for (size_t i = 0; i < size; i++) {
+      EXPECT_NEAR(data[i], base_data[i], 1e-3);
+    }
   }
-  os << '\n';
-  return os.str();
 }
-
-}  // namespace
-
-const float ditu_rnn_target_data[] = {
-    104.711, 11.2431, 1.35422, 0,       0,       0,       0,       0,
-    27.7039, 1.41486, 7.09526, 0,       0,       0,       0,       0,
-    7.6481,  6.5324,  56.383,  2.88018, 8.92918, 132.007, 4.27429, 2.02934,
-    14.1727, 10.7461, 25.0616, 16.0197, 14.4163, 16.9199, 6.75517, 0,
-    80.0249, 4.77739, 0,       0,       0,       0,       0,       0,
-    47.5643, 2.67029, 8.76252, 0,       0,       0,       0,       0,
-    51.8822, 4.4411,  0,       0,       0,       0,       0,       0,
-    10.7286, 12.0595, 10.6672, 0,       0,       0,       0,       0,
-    93.5771, 3.84641, 0,       0,       0,       0,       0,       0,
-    169.426, 0,       0,       0,       0,       0,       0,       0};
 // Test with a really complicate model.
-void TestDituRNNPrediction(const std::string &model_path,
-                           const std::string &data_path, int batch_size,
-                           bool use_analysis, bool activate_ir,
-                           int num_times = 1) {
-  NativeConfig config;
-  config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
-  config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
+void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
+  AnalysisConfig config;
+  config.prog_file = FLAGS_infer_model + "/__model__";
+  config.param_file = FLAGS_infer_model + "/param";
   config.use_gpu = false;
   config.device = 0;
   config.specify_input_name = true;
+  config.enable_ir_optim = activate_ir;
+  PADDLE_ENFORCE(config.ir_mode ==
+                 AnalysisConfig::IrPassMode::kExclude);  // default
+  config.ir_passes.clear();  // Do not exclude any pass.
+
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
 
   auto base_predictor =
       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
   auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config);
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
   std::vector<PaddleTensor> input_slots;
-  DataRecord data(data_path, batch_size);
+  DataRecord data(FLAGS_infer_data, batch_size);
   // Prepare inputs.
   PrepareInputs(&input_slots, &data, batch_size);
   std::vector<PaddleTensor> outputs, base_outputs;
 
   base_predictor->Run(input_slots, &base_outputs);
 
-  Timer timer;
-  timer.tic();
-  for (int i = 0; i < num_times; i++) {
-    predictor->Run(input_slots, &outputs);
-  }
-  LOG(INFO) << "===========profile result===========";
-  LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times
-            << ", latency: " << timer.toc() / num_times << "ms";
-  LOG(INFO) << "=====================================";
-
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_data[i], 1e-3);
+  if (num_threads == 1) {
+    // Prepare inputs.
+    Timer timer;
+    timer.tic();
+    for (int i = 0; i < num_times; i++) {
+      predictor->Run(input_slots, &outputs);
+    }
+    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
+    CompareResult(outputs, base_outputs);
+  } else {
+    std::vector<std::thread> threads;
+    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+    // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
+    // because AttentionLSTM's hard code nodeid will be damanged.
+    for (int tid = 0; tid < num_threads; ++tid) {
+      predictors.emplace_back(
+          CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+              config));
+    }
+    for (int tid = 0; tid < num_threads; ++tid) {
+      threads.emplace_back([&, tid]() {
+        // Each thread should have local input_slots and outputs.
+        std::vector<PaddleTensor> input_slots;
+        DataRecord data(FLAGS_infer_data, batch_size);
+        PrepareInputs(&input_slots, &data, batch_size);
+        std::vector<PaddleTensor> outputs;
+        Timer timer;
+        timer.tic();
+        for (int i = 0; i < num_times; i++) {
+          predictors[tid]->Run(input_slots, &outputs);
+        }
+        PrintTime(batch_size, num_times, num_threads, tid,
+                  timer.toc() / num_times);
+        CompareResult(outputs, base_outputs);
+      });
+    }
+    for (int i = 0; i < num_threads; ++i) {
+      threads[i].join();
     }
   }
 
@@ -327,39 +323,44 @@ void TestDituRNNPrediction(const std::string &model_path,
       LOG(INFO) << "fused " << item.first << " " << item.second;
     }
 
-    ASSERT_TRUE(fuse_statis.count("fc"));
-    EXPECT_EQ(fuse_statis.at("fc"), 1);
-  }
-}
+    int num_ops = 0;
+    for (auto &node :
+         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+      if (node->IsFunction()) {
+        ++num_ops;
+      }
+    }
+    LOG(INFO) << "has num ops: " << num_ops;
 
-// Directly infer with the original model.
-TEST(Analyzer, DituRNN_without_analysis) {
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, false, false, FLAGS_repeat);
+    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+    EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
+    EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
+    EXPECT_EQ(num_ops,
+              13);  // After graph optimization, only 13 operators exists.
+  }
 }
 
-// Inference with the original model with the analysis turned on, the analysis
-// module will transform the program to a data flow graph.
-TEST(Analyzer, DituRNN_with_analysis) {
-  LOG(INFO) << "ditu rnn with analysis";
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, true, false, FLAGS_repeat);
-}
+// Inference with analysis and IR, easy for profiling independently.
+TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); }
 
-// Inference with analysis and IR. The IR module will fuse some large kernels.
-TEST(Analyzer, DituRNN_with_analysis_with_IR) {
-  LOG(INFO) << "ditu rnn with analysis and IR fuse";
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, true, true, FLAGS_repeat);
+// Other unit-tests of RNN1, test different options of use_analysis,
+// activate_ir and multi-threads.
+TEST(Analyzer, RNN_tests) {
+  int num_threads[2] = {1, 4};
+  for (auto i : num_threads) {
+    // Directly infer with the original model.
+    TestRNN1Prediction(false, false, i);
+    // Inference with the original model with the analysis turned on, the
+    // analysis
+    // module will transform the program to a data flow graph.
+    TestRNN1Prediction(true, false, i);
+    // Inference with analysis and IR. The IR module will fuse some large
+    // kernels.
+    TestRNN1Prediction(true, true, i);
+  }
 }
 
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
-
-USE_PASS(fc_fuse_pass);
-USE_PASS(seq_concat_fc_fuse_pass);
-USE_PASS(fc_lstm_fuse_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(attention_lstm_fuse_pass);
diff --git a/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc b/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..65169f8cfcc5bf1e989609666f6e0ba03e42e5ba
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc
@@ -0,0 +1,118 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include <gflags/gflags.h>
+#include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
+#include <gtest/gtest.h>
+#include <fstream>
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/api/timer.h"
+
+DEFINE_string(infer_model, "", "Directory of the inference model.");
+DEFINE_string(infer_data, "", "Path of the dataset.");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(repeat, 1, "How many times to repeat run.");
+DEFINE_int32(topn, -1, "Run top n batches of data to save time");
+
+namespace paddle {
+namespace inference {
+
+struct DataReader {
+  explicit DataReader(const std::string &path)
+      : file(new std::ifstream(path)) {}
+
+  bool NextBatch(PaddleTensor *tensor, int batch_size) {
+    PADDLE_ENFORCE_EQ(batch_size, 1);
+    std::string line;
+    tensor->lod.clear();
+    tensor->lod.emplace_back(std::vector<size_t>({0}));
+    std::vector<int64_t> data;
+
+    for (int i = 0; i < batch_size; i++) {
+      if (!std::getline(*file, line)) return false;
+      inference::split_to_int64(line, ' ', &data);
+    }
+    tensor->lod.front().push_back(data.size());
+
+    tensor->data.Resize(data.size() * sizeof(int64_t));
+    memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t));
+    tensor->shape.clear();
+    tensor->shape.push_back(data.size());
+    tensor->shape.push_back(1);
+    return true;
+  }
+
+  std::unique_ptr<std::ifstream> file;
+};
+
+void Main(int batch_size) {
+  // shape --
+  // Create Predictor --
+  AnalysisConfig config;
+  config.model_dir = FLAGS_infer_model;
+  config.use_gpu = false;
+  config.enable_ir_optim = true;
+  auto predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+
+  std::vector<PaddleTensor> input_slots(1);
+  // one batch starts
+  // data --
+  auto &input = input_slots[0];
+  input.dtype = PaddleDType::INT64;
+
+  inference::Timer timer;
+  double sum = 0;
+  std::vector<PaddleTensor> output_slots;
+
+  int num_batches = 0;
+  for (int t = 0; t < FLAGS_repeat; t++) {
+    DataReader reader(FLAGS_infer_data);
+    while (reader.NextBatch(&input, FLAGS_batch_size)) {
+      if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break;
+      timer.tic();
+      CHECK(predictor->Run(input_slots, &output_slots));
+      sum += timer.toc();
+      ++num_batches;
+    }
+  }
+  PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
+
+  // Get output
+  LOG(INFO) << "get outputs " << output_slots.size();
+
+  for (auto &output : output_slots) {
+    LOG(INFO) << "output.shape: " << to_string(output.shape);
+    // no lod ?
+    CHECK_EQ(output.lod.size(), 0UL);
+    LOG(INFO) << "output.dtype: " << output.dtype;
+    std::stringstream ss;
+    for (int i = 0; i < 5; i++) {
+      ss << static_cast<float *>(output.data.data())[i] << " ";
+    }
+    LOG(INFO) << "output.data summary: " << ss.str();
+    // one batch ends
+  }
+}
+
+TEST(text_classification, basic) { Main(FLAGS_batch_size); }
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 3a4ffe967e67ab0487192bbf12d4d5a15f536aa3..e8fb0775b45761f64fd6fd28306c35b76d1e40c4 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -67,7 +67,7 @@ struct Argument {
     PADDLE_ENFORCE(!attrs_.count(key), "Duplicate set Argument's attr [%s]",
                    key);
     attrs_[key] = data;
-    attr_deleters_[key] = [data, key, this]() {
+    attr_deleters_[key] = [data, key]() {
       VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
       VLOG(3) << "argument delete attr: " << key;
       delete data;
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index 80c85555e722433f3657e880520b3fe459f6ce1a..8579845d51e80d73d220465d25b70944f5ad9bf2 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -263,7 +263,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }  // namespace
 
-Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
+AnalysisPass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
   return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
       FLAGS_IA_graphviz_log_root,
       "data_flow_graph_to_fluid_graphviz_debugger"));
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
index 0c9a8a0b7cae17bf2eaa714348ea1c9b5e43611b..891c7226e245fa3b92892785362c186185a61f62 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -21,8 +21,8 @@
 
 #include <string>
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 
 namespace paddle {
 namespace inference {
@@ -42,7 +42,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
     return "Transform a DFG to a Fluid ProgramDesc";
   }
 
-  Pass *CreateGraphvizDebugerPass() const override;
+  AnalysisPass *CreateGraphvizDebugerPass() const override;
 
  protected:
   // Add a Fluid Op into the ProgramDesc.
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
index 17445ab4407a159ca11345bc9a9226b3ad0044f0..e537bfc0e64d4ff46b3d61499a1a0298ed83533f 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@@ -21,8 +21,8 @@ limitations under the License. */
 
 #include <fstream>
 #include <string>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/dot.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/analysis/flags.h b/paddle/fluid/inference/analysis/flags.h
new file mode 100644
index 0000000000000000000000000000000000000000..717e543f01dfa071865a5c14c0b7679e65239daf
--- /dev/null
+++ b/paddle/fluid/inference/analysis/flags.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+
+// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
+// flag if not available.
+DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
+DECLARE_string(IA_graphviz_log_root);
+DECLARE_string(IA_output_storage_path);
+DECLARE_bool(IA_enable_ir);
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index 51bd0ac42d455f68ac5d70f0ce9703dfad6070d4..2b7d632c839e735ca03c6e17b94307b40cc13374 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -66,7 +66,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }
 
-Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
+AnalysisPass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
   return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
       FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger"));
 }
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
index fb948bf2242abcbc1e841fd3b8457e63358782c5..b9e262020e9522e167b998d57e2be2ac19b48447 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -22,8 +22,8 @@
 #include <string>
 
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 
 namespace paddle {
 namespace inference {
@@ -46,7 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
     return "transform a fluid ProgramDesc to a data flow graph.";
   }
 
-  Pass *CreateGraphvizDebugerPass() const override;
+  AnalysisPass *CreateGraphvizDebugerPass() const override;
 
  private:
   framework::proto::ProgramDesc const *desc_;
diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
index 6731b1f759363eec5dd8645783212a72ace67b2f..c2599e218a2306f9353b843b7ea3f18aeacb008e 100644
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
@@ -14,14 +14,17 @@
 
 #pragma once
 
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/inference/analysis/flags.h"
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
-using namespace framework;
 
 static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__";
 
@@ -47,7 +50,8 @@ class FluidToIrPass final : public DataFlowGraphPass {
     ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
     // Load program.
     auto program = LoadProgramDesc(*argument->fluid_model_program_path);
-    argument->origin_program_desc.reset(new proto::ProgramDesc(program));
+    argument->origin_program_desc.reset(
+        new framework::proto::ProgramDesc(program));
     // Create main data flow graph.
     if (!argument->main_dfg) {
       argument->main_dfg.reset(new DataFlowGraph);
@@ -77,27 +81,30 @@ class FluidToIrPass final : public DataFlowGraphPass {
     IRPassManager ir_passes(argument_->Get<ProgramDesc>("ir_program_desc"),
                             nullptr);
     // Pass the scope from analysis to IR if needed.
-    if (argument_->Has(ir::kParamScopeAttr)) {
+    if (argument_->Has(framework::ir::kParamScopeAttr)) {
       // Here the address is passed, attention that IR doesn't own the scope, so
       // the real scope in analysis should live during the IR phase.
       ir_passes.graph().Set(
-          ir::kParamScopeAttr,
-          new Scope *(&argument_->Get<Scope>(ir::kParamScopeAttr)));
+          framework::ir::kParamScopeAttr,
+          new framework::Scope *(&argument_->Get<framework::Scope>(
+              framework::ir::kParamScopeAttr)));
     }
 
-    const auto &ir_passes_to_apply =
-        argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr);
-    ir_passes.Apply(ir_passes_to_apply);
+    if (FLAGS_IA_enable_ir) {
+      const auto &ir_passes_to_apply =
+          argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr);
+      ir_passes.Apply(ir_passes_to_apply);
+    }
 
     PADDLE_ENFORCE(argument_->main_dfg.get());
     argument_->main_dfg->Build(ir_passes.graph());
     // inherit the arguments from ir.
-    if (ir_passes.graph().Has(ir::kFuseStatisAttr)) {
+    if (ir_passes.graph().Has(framework::ir::kFuseStatisAttr)) {
       argument_->Set(
-          ir::kFuseStatisAttr,
+          framework::ir::kFuseStatisAttr,
           new std::unordered_map<std::string, int>(
               ir_passes.graph().Get<std::unordered_map<std::string, int>>(
-                  ir::kFuseStatisAttr)));
+                  framework::ir::kFuseStatisAttr)));
     }
   }
 
@@ -109,7 +116,7 @@ class FluidToIrPass final : public DataFlowGraphPass {
 
  private:
   // Load parameters from a single file or from a directory.
-  bool LoadParams(Scope *scope, const std::string &dir,
+  bool LoadParams(framework::Scope *scope, const std::string &dir,
                   const std::string &prog_file, const std::string &param_file);
 
  private:
diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
index 6a13c60e7b2ebf645b12d5ddf83ef6ab3a2e83bd..367c25805d05f8d10fb8341158760ac6356a5c48 100644
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
@@ -16,6 +16,7 @@
 
 #include <gtest/gtest.h>
 #include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 
 namespace paddle {
 namespace inference {
@@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) {
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
-
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(attention_lstm_fuse_pass);
-USE_PASS(fc_lstm_fuse_pass);
-USE_PASS(seq_concat_fc_fuse_pass);
-USE_PASS(fc_fuse_pass);
diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h
index 3a2869e30bd80cfd0756f8e21acb414656620eaa..f14b49e09c2f8e79c6fc4accdbf17f4f7a9bb1a3 100644
--- a/paddle/fluid/inference/analysis/model_store_pass.h
+++ b/paddle/fluid/inference/analysis/model_store_pass.h
@@ -19,7 +19,7 @@
 
 #pragma once
 #include <string>
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc
index ff5ec94265a4f05c1294ad6c8ac5f86c249b84b6..759b2b96a1944c060ac98b6865b58ba2f6369607 100644
--- a/paddle/fluid/inference/analysis/pass_manager.cc
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
@@ -40,17 +40,6 @@ void DfgPassManager::RunAll() {
   }
 }
 
-void NodePassManager::RunAll() {
-  PADDLE_ENFORCE(argument_);
-  PADDLE_ENFORCE(argument_->main_dfg.get());
-  auto trait = GraphTraits<DataFlowGraph>(*argument_->main_dfg).nodes_in_DFS();
-  for (auto& node : trait) {
-    for (auto& pass : data_) {
-      pass->Run(&node);
-    }
-  }
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h
index 81a17e0287a5aef8a328e43380ee3691f5a32379..412747c4fcce73303703f586f7a04edf4cc5ee76 100644
--- a/paddle/fluid/inference/analysis/pass_manager.h
+++ b/paddle/fluid/inference/analysis/pass_manager.h
@@ -33,7 +33,7 @@ limitations under the License. */
 
 #include <string>
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 
 namespace paddle {
 namespace inference {
@@ -43,7 +43,7 @@ namespace analysis {
  * PassManager is the base class for all pass managers, a pass manager has
  * several Pass-es registered, and execute them in the linear order.
  */
-class PassManager : public OrderedRegistry<Pass> {
+class PassManager : public OrderedRegistry<AnalysisPass> {
  public:
   PassManager() = default;
   // Call all the passes' Initialize methods. The desc and data_flow_graph are
@@ -89,18 +89,6 @@ class DfgPassManager : public PassManager {
   virtual ~DfgPassManager() = default;
 };
 
-/*
- * A pass manager that process a Node each time.
- */
-class NodePassManager : public PassManager {
- public:
-  NodePassManager() = default;
-
-  void RunAll() override;
-
-  virtual ~NodePassManager() = default;
-};
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc
index 13423e4837e12a96e7a5dfc9ca3f59bf8b14746a..72b0fbf7e571ec97a0ea093d01449c1d5ddb9b91 100644
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
@@ -34,28 +34,6 @@ class TestDfgPassManager final : public DfgPassManager {
   std::string description() const override { return "test doc"; }
 };
 
-class TestNodePassManager final : public NodePassManager {
- public:
-  virtual ~TestNodePassManager() = default;
-
-  std::string repr() const override { return "test-node-pass-manager"; }
-  std::string description() const override { return "test doc"; }
-};
-
-class TestNodePass final : public NodePass {
- public:
-  virtual ~TestNodePass() = default;
-
-  bool Initialize(Argument* argument) override { return true; }
-
-  void Run(Node* node) override {
-    LOG(INFO) << "- Processing node " << node->repr();
-  }
-
-  std::string repr() const override { return "test-node"; }
-  std::string description() const override { return "some doc"; }
-};
-
 TEST(PassManager, DFG_pass_manager) {
   TestDfgPassManager manager;
   DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
@@ -71,19 +49,6 @@ TEST(PassManager, DFG_pass_manager) {
   manager.RunAll();
 }
 
-TEST(PassManager, Node_pass_manager) {
-  Argument argument(FLAGS_inference_model_dir);
-  // Pre-process: initialize the DFG with the ProgramDesc first.
-  FluidToDataFlowGraphPass pass0;
-  pass0.Initialize(&argument);
-  pass0.Run(argument.main_dfg.get());
-
-  TestNodePassManager manager;
-  manager.Register("test-node-pass", new TestNodePass);
-  ASSERT_TRUE(manager.Initialize(&argument));
-  manager.RunAll();
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
index 9f51fafe0b2a66f9d062a6b751fe7a3bc662ce7c..174c8513f92cf869419f04cab5a54af65e9673b8 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
@@ -68,7 +68,7 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass {
   }
 };
 
-Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
+AnalysisPass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
   DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root,
                                       "tensorrt_marked_node");
   return new DfgDebuggerPass(config);
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
index c558a6ebbde371071c7330a14cc986bf764d1773..c881a54c240538b68abdcb9060db69de3bf2b8bb 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
@@ -20,7 +20,7 @@
 #pragma once
 
 #include <string>
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
 
 namespace paddle {
@@ -48,7 +48,7 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
     return "tensorrt sub-graph mark pass";
   }
 
-  Pass* CreateGraphvizDebugerPass() const override;
+  AnalysisPass* CreateGraphvizDebugerPass() const override;
   bool Finalize() override;
 
  private:
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
index c6741a92095d33d261a4e1667c87a8ca02e51a9f..219e3f5470f627e81005aabf94f9c72c33fd2eed 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/node.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index adfe4392448557a30cd834022b9a5d21d9086b95..5df486f345a98d7737d326c94e4854d24535ff61 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -18,10 +18,7 @@ if(APPLE)
 endif(APPLE)
 
 
-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager
-  graph_viz_pass fc_fuse_pass
-  infer_clean_graph_pass
-  )
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB})
 
 if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
@@ -47,8 +44,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)
 
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api)
-
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis)
 cc_test(test_paddle_inference_api
         SRCS api_tester.cc
         DEPS paddle_inference_api)
@@ -64,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
 inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()
 
-if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
+if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
     # compile the libinference_anakin_api.a and anakin.so.
     cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml)
     cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
@@ -74,12 +70,24 @@ if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
     anakin_target(inference_anakin_api)
     anakin_target(inference_anakin_api_shared)
     if (WITH_TESTING)
-        cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc 
-                ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
-                DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+        # TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
+        set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
+        set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
+        set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
+        execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
+        execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
+        execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
+        if(WITH_GPU)
+            set(anakin_test_extra_deps dynload_cuda)
+            set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
+            execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
+            cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc 
+                    ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
+                    DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
+        endif()
         cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc 
                 ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin 
                      --datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt
-                DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+                DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
     endif(WITH_TESTING)
 endif()
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 33862232bdaae817b9ca72879605386c32ed3e8b..2a9a7aed480e76edbac4d5ba6d7bc3b8b2dc5006 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -14,23 +14,40 @@
 
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DECLARE_bool(profile);
 
 namespace paddle {
 
 bool AnalysisPredictor::Init(
     const std::shared_ptr<framework::Scope>& parent_scope) {
   VLOG(3) << "Predictor::init()";
+#if !defined(_WIN32)
+  if (FLAGS_profile) {
+    LOG(WARNING) << "Profiler is actived, might affect the performance";
+    LOG(INFO) << "You can turn off by set gflags '-profile false'";
+    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
+                                           : platform::ProfilerState::kCPU;
+    platform::EnableProfiler(tracking_device);
+  }
+#endif
+
   if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
+    LOG(WARNING) << "ir optimize only supports CPU currently";
+    config_.enable_ir_optim = false;
   } else {
     place_ = paddle::platform::CPUPlace();
   }
-  PADDLE_ENFORCE(!parent_scope);
   if (parent_scope) {
     scope_ = parent_scope;
     sub_scope_ = &(parent_scope->NewScope());
@@ -72,7 +89,7 @@ bool AnalysisPredictor::Init(
 
 void AnalysisPredictor::OptimizeInferenceProgram() {
   LOG(INFO) << "optimize begin";
-  FLAGS_IA_enable_ir = true;
+  FLAGS_IA_enable_ir = config_.enable_ir_optim;
   FLAGS_IA_enable_tensorrt_subgraph_engine = false;
   FLAGS_IA_output_storage_path = "";  // Don't output the model.
   // Analyze inference_program
@@ -89,24 +106,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   }
   argument_.origin_program_desc.reset(
       new ProgramDesc(*inference_program_->Proto()));
-  Analyzer().Run(&argument_);
+  PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude,
+                 "Only kExclude is supported yet.");
+  Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);
+
   CHECK(argument_.transformed_program_desc);
   VLOG(5) << "to prepare executor";
-  // LOG(INFO) << "transformed_parogram_desc " <<
-  // argument.transformed_program_desc->DebugString();
   inference_program_.reset(
       new framework::ProgramDesc(*argument_.transformed_program_desc));
-  PADDLE_ENFORCE(argument_.Has(framework::ir::kParamScopeAttr));
-  // Update scope.
-  scope_.reset(
-      argument_.Release<framework::Scope>(framework::ir::kParamScopeAttr));
-  LOG(INFO) << "optimize end ==";
+  if (argument_.Has(framework::ir::kParamScopeAttr)) {
+    // Update scope.
+    scope_.reset(
+        argument_.Release<framework::Scope>(framework::ir::kParamScopeAttr));
+  }
+  LOG(INFO) << "== optimize end ==";
 }
 
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
-    NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) {
-  VLOG(3) << "create NativePredictor";
+    AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) {
+  VLOG(3) << "create AnalysisConfig";
   if (config.use_gpu) {
     // 1. GPU memeroy
     PADDLE_ENFORCE_GT(
@@ -133,7 +152,3 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
 }
 
 }  // namespace paddle
-
-USE_PASS(fc_fuse_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index e32b6185f6044ab3577bde0a8f8dcf2391688aa8..e53925366e9214cd60422efe56884751297c15e5 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <string>
+#include <vector>
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
@@ -28,7 +30,7 @@ using framework::proto::ProgramDesc;
  */
 class AnalysisPredictor : public NativePaddlePredictor {
  public:
-  explicit AnalysisPredictor(const NativeConfig& config)
+  explicit AnalysisPredictor(const AnalysisConfig& config)
       : NativePaddlePredictor(config), config_(config) {}
 
   bool Init(const std::shared_ptr<framework::Scope>& parent_scope);
@@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor {
   Argument& analysis_argument() { return argument_; }
 
  private:
-  NativeConfig config_;
+  AnalysisConfig config_;
   Argument argument_;
 };
 
diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc
index ea66aa89b87ba3c25cdcd5eb2c5155a481ef7987..43b31269d2bd82c06e284e3599a3763da693a2af 100644
--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
@@ -193,7 +193,9 @@ PaddleInferenceAnakinPredictor<Target>::Clone() {
   return std::move(cls);
 }
 
+#ifdef PADDLE_WITH_CUDA
 template class PaddleInferenceAnakinPredictor<anakin::NV>;
+#endif
 template class PaddleInferenceAnakinPredictor<anakin::X86>;
 
 // A factory to help create difference predictor.
@@ -202,10 +204,15 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
   VLOG(3) << "Anakin Predictor create.";
   if (config.target_type == AnakinConfig::NVGPU) {
+#ifdef PADDLE_WITH_CUDA
     VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
     std::unique_ptr<PaddlePredictor> x(
         new PaddleInferenceAnakinPredictor<anakin::NV>(config));
     return x;
+#else
+    LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment";
+    return nullptr;
+#endif
   } else if (config.target_type == AnakinConfig::X86) {
     VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
     std::unique_ptr<PaddlePredictor> x(
diff --git a/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc
index 6183864234e85b89e94821890d9606b082c59233..98c74aaa562dce6618ccde8f11f4344eefd59ef2 100644
--- a/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc
@@ -20,71 +20,16 @@ limitations under the License. */
 #include <iostream>
 #include <thread>  // NOLINT
 #include <vector>
-#include "framework/core/net/net.h"
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/timer.h"
+#include "utils/logger/logger.h"
 
 DEFINE_string(model, "", "Directory of the inference model.");
 DEFINE_string(datapath, "", "Path of the dataset.");
 DEFINE_int32(batch_size, 1, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
 
-// Timer for timer
-class Timer {
- public:
-  double start;
-  double startu;
-  void tic() {
-    struct timeval tp;
-    gettimeofday(&tp, NULL);
-    start = tp.tv_sec;
-    startu = tp.tv_usec;
-  }
-  double toc() {
-    struct timeval tp;
-    gettimeofday(&tp, NULL);
-    double used_time_ms =
-        (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
-    return used_time_ms;
-  }
-};
-
-std::vector<std::string> string_split(std::string in_str,
-                                      std::string delimiter) {
-  std::vector<std::string> seq;
-  int found = in_str.find(delimiter);
-  int pre_found = -1;
-  while (found != std::string::npos) {
-    if (pre_found == -1) {
-      seq.push_back(in_str.substr(0, found));
-    } else {
-      seq.push_back(in_str.substr(pre_found + delimiter.length(),
-                                  found - delimiter.length() - pre_found));
-    }
-    pre_found = found;
-    found = in_str.find(delimiter, pre_found + delimiter.length());
-  }
-  seq.push_back(
-      in_str.substr(pre_found + 1, in_str.length() - (pre_found + 1)));
-  return seq;
-}
-std::vector<std::string> string_split(
-    std::string in_str, std::vector<std::string>& delimiter) {  // NOLINT
-  std::vector<std::string> in;
-  std::vector<std::string> out;
-  out.push_back(in_str);
-  for (auto del : delimiter) {
-    in = out;
-    out.clear();
-    for (auto s : in) {
-      auto out_s = string_split(s, del);
-      for (auto o : out_s) {
-        out.push_back(o);
-      }
-    }
-  }
-  return out;
-}
-
 class Data {
  public:
   Data(std::string file_name, int batch_size)
@@ -120,36 +65,24 @@ void Data::get_batch_data(
   week_fea.clear();
   time_fea.clear();
   while (_file.getline(buf, 10000)) {
-    std::string s = buf;
-    std::vector<std::string> deli_vec = {":"};
-    std::vector<std::string> data_vec = string_split(s, deli_vec);
+    std::vector<std::string> data_vec;
+    paddle::inference::split(buf, ':', &data_vec);
 
     std::vector<std::string> seq;
-    seq = string_split(data_vec[0], {"|"});
+    paddle::inference::split(data_vec[0], '|', &seq);
 
     for (auto link : seq) {
-      std::vector<std::string> data = string_split(link, ",");
       std::vector<float> vec;
-      for (int i = 0; i < data.size(); i++) {
-        vec.push_back(atof(data[i].c_str()));
-      }
+      paddle::inference::split_to_float(link, ',', &vec);
       fea.push_back(vec);
     }
-    std::vector<std::string> week_data;
-    std::vector<std::string> time_data;
 
-    week_data = string_split(data_vec[2], ",");
     std::vector<float> vec_w;
-    for (int i = 0; i < week_data.size(); i++) {
-      vec_w.push_back(atof(week_data[i].c_str()));
-    }
+    paddle::inference::split_to_float(data_vec[2], ',', &vec_w);
     week_fea.push_back(vec_w);
 
-    time_data = string_split(data_vec[1], ",");
     std::vector<float> vec_t;
-    for (int i = 0; i < time_data.size(); i++) {
-      vec_t.push_back(atof(time_data[i].c_str()));
-    }
+    paddle::inference::split_to_float(data_vec[1], ',', &vec_t);
     time_fea.push_back(vec_t);
 
     cum += seq.size();
@@ -275,14 +208,13 @@ void single_test() {
     inputs.push_back(tensor_2);
     inputs.push_back(tensor_0);
 
-    Timer timer;
+    paddle::inference::Timer timer;
     timer.tic();
     for (int i = 0; i < FLAGS_repeat; i++) predictor->Run(inputs, &outputs);
 
-    LOG(INFO) << "batch_size = " << FLAGS_batch_size
-              << ", repeat = " << FLAGS_repeat
-              << ", sequence_length = " << seq_offset[seq_offset.size() - 1]
-              << ", latency: " << timer.toc() / FLAGS_repeat << "ms";
+    paddle::inference::PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0,
+                                 timer.toc() / FLAGS_repeat);
+    LOG(INFO) << "sequence_length = " << seq_offset[seq_offset.size() - 1];
 
     float* data_o = static_cast<float*>(outputs[0].data.data());
     VLOG(3) << "outputs[0].data.length() = " << outputs[0].data.length();
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index da1c0b1fbc9b2d025422deb11368e4070bffdc74..bd9b4b1a814f995e3979105f5b9830b95fd8ea7d 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <sys/time.h>
 #include <algorithm>
 #include <map>
 #include <set>
@@ -23,32 +22,14 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/timer.h"
 #include "paddle/fluid/platform/profiler.h"
 
 DEFINE_bool(profile, false, "Turn on profiler for fluid");
 
 namespace paddle {
 namespace {
-
-// Timer for timer
-class Timer {
- public:
-  double start;
-  double startu;
-  void tic() {
-    struct timeval tp;
-    gettimeofday(&tp, NULL);
-    start = tp.tv_sec;
-    startu = tp.tv_usec;
-  }
-  double toc() {
-    struct timeval tp;
-    gettimeofday(&tp, NULL);
-    double used_time_ms =
-        (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
-    return used_time_ms;
-  }
-};
+using paddle::inference::Timer;
 
 template <class T>
 std::string num2str(T a) {
@@ -69,7 +50,7 @@ void NativePaddlePredictor::PrepareFeedFetch() {
       feed_names_[op->Output("Out")[0]] = idx;
     } else if (op->Type() == "fetch") {
       int idx = boost::get<int>(op->GetAttr("col"));
-      if (fetchs_.size() <= idx) {
+      if (fetchs_.size() <= static_cast<size_t>(idx)) {
         fetchs_.resize(idx + 1);
       }
       fetchs_[idx] = op;
@@ -80,7 +61,7 @@ void NativePaddlePredictor::PrepareFeedFetch() {
 bool NativePaddlePredictor::Init(
     std::shared_ptr<framework::Scope> parent_scope) {
   VLOG(3) << "Predictor::init()";
-
+#if !defined(_WIN32)
   if (FLAGS_profile) {
     LOG(WARNING) << "Profiler is actived, might affect the performance";
     LOG(INFO) << "You can turn off by set gflags '-profile false'";
@@ -89,6 +70,7 @@ bool NativePaddlePredictor::Init(
                                            : platform::ProfilerState::kCPU;
     platform::EnableProfiler(tracking_device);
   }
+#endif
 
   if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
@@ -133,10 +115,12 @@ bool NativePaddlePredictor::Init(
 }
 
 NativePaddlePredictor::~NativePaddlePredictor() {
+#if !defined(_WIN32)
   if (FLAGS_profile) {
     platform::DisableProfiler(platform::EventSortingKey::kTotal,
                               "./profile.log");
   }
+#endif
   if (sub_scope_) {
     scope_->DeleteScope(sub_scope_);
   }
@@ -179,15 +163,21 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
     LOG(ERROR) << "fail to call Init";
     return nullptr;
   }
+#ifdef __clang__
+  // fix clang compile error
+  return cls;
+#else
   // fix manylinux compile error.
   return std::move(cls);
+#endif
 }
 
 bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                     framework::Scope *scope) {
   VLOG(3) << "Predictor::set_feed";
   if (inputs.size() != feeds_.size()) {
-    LOG(ERROR) << "wrong feed input size.";
+    LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
+               << inputs.size();
     return false;
   }
   for (size_t i = 0; i < inputs.size(); ++i) {
@@ -222,6 +212,62 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
   }
   return true;
 }
+template <typename T>
+void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
+                                        PaddleTensor *output) {
+  std::vector<int> shape;
+  auto dims_i = fetch.dims();
+  auto lod = fetch.lod();
+  const T *output_ptr = fetch.data<T>();
+  auto num = fetch.numel();
+  std::vector<T> data;
+  if (0 == lod.size()) {
+    std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
+    for (int j = 0; j < dims_i.size(); ++j) {
+      shape.push_back(dims_i[j]);
+    }
+  } else {
+    // for batch detection
+    // image[0] -> output[0] shape {145, 6}
+    // image[1] -> output[1] shape {176, 6}
+    // then,
+    // the batch output shape {321, 6}
+    // the lod {{0, 145, 321}}
+    // so we should append output[0] to {176, 6}
+    size_t max_dim = 0;
+    for (size_t j = 1; j < lod[0].size(); j++) {
+      max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
+    }
+    size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
+    if (max_dim > 0) {
+      data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
+    }
+    for (size_t j = 1; j < lod[0].size(); j++) {
+      size_t start = lod[0][j - 1] * common_dim;
+      size_t end = lod[0][j] * common_dim;
+      if (end > start) {
+        std::copy(output_ptr + start, output_ptr + end,
+                  data.begin() + (j - 1) * max_dim * common_dim);
+      }
+    }
+    shape.push_back(lod[0].size() - 1);
+    shape.push_back(max_dim);
+    for (int j = 1; j < dims_i.size(); ++j) {
+      shape.push_back(dims_i[j]);
+    }
+  }
+
+  output->shape = shape;
+  auto &buffer = output->data;
+  if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) {
+    buffer.Resize(sizeof(T) * data.size());
+  }
+  std::memcpy(buffer.data(), data.data(), buffer.length());
+  // copy LoD
+  for (const auto &level : fetch.lod()) {
+    output->lod.emplace_back(level);
+  }
+}
 
 bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                      framework::Scope *scope) {
@@ -229,70 +275,20 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   outputs->resize(fetchs_.size());
   for (size_t i = 0; i < fetchs_.size(); ++i) {
     int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
-    PADDLE_ENFORCE(idx == i);
-    framework::LoDTensor &output =
+    PADDLE_ENFORCE((size_t)idx == i);
+    framework::LoDTensor &fetch =
         framework::GetFetchVariable(*scope, "fetch", idx);
-    // TODO(panyx0718): Support fetch of other types.
-    if (output.type() != typeid(float)) {
-      LOG(ERROR) << "only support fetching float now.";
-      return false;
-    }
-
-    std::vector<int> shape;
-    auto dims_i = output.dims();
-    auto lod = output.lod();
-    const float *output_ptr = output.data<float>();
-    // const int64_t* output_ptr = fetchs[i].data<int64_t>();
-    auto num = output.numel();
-    std::vector<float> data;
-    if (0 == lod.size()) {
-      std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
-      for (int j = 0; j < dims_i.size(); ++j) {
-        shape.push_back(dims_i[j]);
-      }
+    auto type = fetch.type();
+    auto output = &(outputs->at(i));
+    if (type == typeid(float)) {
+      GetFetchOne<float>(fetch, output);
+      output->dtype = PaddleDType::FLOAT32;
+    } else if (type == typeid(int64_t)) {
+      GetFetchOne<int64_t>(fetch, output);
+      output->dtype = PaddleDType::INT64;
     } else {
-      // for batch detection
-      // image[0] -> output[0] shape {145, 6}
-      // image[1] -> output[1] shape {176, 6}
-      // then,
-      // the batch output shape {321, 6}
-      // the lod {{0, 145, 321}}
-      // so we should append output[0] to {176, 6}
-      size_t max_dim = 0;
-      for (size_t j = 1; j < lod[0].size(); j++) {
-        max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
-      }
-      size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
-      if (max_dim > 0) {
-        data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
-      }
-      for (size_t j = 1; j < lod[0].size(); j++) {
-        size_t start = lod[0][j - 1] * common_dim;
-        size_t end = lod[0][j] * common_dim;
-        if (end > start) {
-          std::copy(output_ptr + start, output_ptr + end,
-                    data.begin() + (j - 1) * max_dim * common_dim);
-        }
-      }
-      shape.push_back(lod[0].size() - 1);
-      shape.push_back(max_dim);
-      for (int j = 1; j < dims_i.size(); ++j) {
-        shape.push_back(dims_i[j]);
-      }
-    }
-
-    outputs->at(i).shape = shape;
-    auto &buffer = outputs->at(i).data;
-    if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) {
-      buffer.Resize(sizeof(float) * data.size());
-    }
-    std::memcpy(buffer.data(), data.data(), buffer.length());
-    // copy LoD
-    for (const auto &level : output.lod()) {
-      outputs->at(i).lod.emplace_back(level);
+      LOG(ERROR) << "unknown type, only support float32 and int64 now.";
     }
-    outputs->at(i).dtype = PaddleDType::FLOAT32;
-    // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
   }
   return true;
 }
@@ -323,7 +319,12 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
     return nullptr;
   }
+#ifdef __clang__
+  // fix clang compile error
+  return predictor;
+#else
   return std::move(predictor);
+#endif
 }
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index 4eff9204eba987aed11e4066fa7b6f6cc610a763..ec801c58857e716241d28404510530e551ed25aa 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -51,7 +51,9 @@ class NativePaddlePredictor : public PaddlePredictor {
                framework::Scope *scope);
   bool GetFetch(std::vector<PaddleTensor> *output_data,
                 framework::Scope *scope);
-
+  template <typename T>
+  void GetFetchOne(const framework::LoDTensor &fetchs,
+                   PaddleTensor *output_data);
   void PrepareFeedFetch();
 
   NativeConfig config_;
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index a697218377e1e661dccc8d8a4c78f15b5c211243..afb46a7139f6ab8e6b3697fdc56fe1c78a05cd64 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -3,6 +3,11 @@ cmake_minimum_required(VERSION 3.0)
 project(cpp_inference_demo CXX C)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+if (WIN32)
+set(CMAKE_STATIC_LIBRARY_PREFIX "lib")
+else()
+set(CMAKE_STATIC_LIBRARY_PREFIX "")
+endif()
 
 if(NOT DEFINED PADDLE_LIB)
   message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
@@ -32,44 +37,56 @@ endif(NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/boost")
 include_directories("${PADDLE_LIB}/third_party/eigen3")
 
+if (NOT WIN32)
 link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
 link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
+link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+endif(NOT WIN32)
+
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
 link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
 link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
-link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+link_directories("${PADDLE_LIB}/paddle/fluid/inference")
 
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 
 if(WITH_MKL)
   include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so 
-               ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5.so)
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} 
+               ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
   set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
   if(EXISTS ${MKLDNN_PATH})
     include_directories("${MKLDNN_PATH}/include")
     set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
   endif()
 else()
-  set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
 endif()
 
 # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
   set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a)
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
 else()
   set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
 endif()
-set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 
+if (NOT WIN32)
+set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 set(DEPS ${DEPS}
     ${MATH_LIB} ${MKLDNN_LIB}
     glog gflags protobuf snappystream snappy z
     ${EXTERNAL_LIB})
+else()
+set(DEPS ${DEPS}
+    ${MATH_LIB} ${MKLDNN_LIB}
+    ${CMAKE_STATIC_LIBRARY_PREFIX}glog  ${CMAKE_STATIC_LIBRARY_PREFIX}gflags  ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
+    ${EXTERNAL_LIB})
+endif(NOT WIN32)
+
 if(WITH_GPU)
-  set(DEPS ${DEPS} ${CUDA_LIB}/libcudart.so)
+  set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
 endif()
 
 target_link_libraries(${DEMO_NAME} ${DEPS})
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 7824ef2649af81a2390ff3bc537eb7c93c70e402..0f7d541c5edfc62e80cf50f83b491f06dcb42644 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -14,7 +14,7 @@ else
 fi
 
 PREFIX=inference-vis-demos%2F
-URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
+URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX}
 
 # download vis_demo data
 function download() {
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index e44b1b74bc385c015fa6efcebac05359a810cbc1..f6893be428feacbba85bab380e22972848eaeb93 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -14,36 +14,19 @@
 
 #pragma once
 
+#include <glog/logging.h>
 #include <sys/time.h>
 #include <algorithm>
+#include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/timer.h"
 
 namespace paddle {
 namespace inference {
 
-// Timer for timer
-class Timer {
- public:
-  double start;
-  double startu;
-  void tic() {
-    struct timeval tp;
-    gettimeofday(&tp, NULL);
-    start = tp.tv_sec;
-    startu = tp.tv_usec;
-  }
-  double toc() {
-    struct timeval tp;
-    gettimeofday(&tp, NULL);
-    double used_time_ms =
-        (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
-    return used_time_ms;
-  }
-};
-
 static void split(const std::string &str, char sep,
                   std::vector<std::string> *pieces) {
   pieces->clear();
@@ -68,6 +51,13 @@ static void split_to_float(const std::string &str, char sep,
   std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs),
                  [](const std::string &v) { return std::stof(v); });
 }
+static void split_to_int64(const std::string &str, char sep,
+                           std::vector<int64_t> *is) {
+  std::vector<std::string> pieces;
+  split(str, sep, &pieces);
+  std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is),
+                 [](const std::string &v) { return std::stoi(v); });
+}
 template <typename T>
 std::string to_string(const std::vector<T> &vec) {
   std::stringstream ss;
@@ -84,15 +74,59 @@ template <>
 std::string to_string<std::vector<std::vector<float>>>(
     const std::vector<std::vector<std::vector<float>>> &vec);
 
-// clang-format off
-static void TensorAssignData(PaddleTensor *tensor, const std::vector<std::vector<float>> &data) {
+template <typename T>
+static void TensorAssignData(PaddleTensor *tensor,
+                             const std::vector<std::vector<T>> &data) {
   // Assign buffer
-  int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, [](int a, int b) { return a * b; });
-  tensor->data.Resize(sizeof(float) * dim);
+  int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1,
+                            [](int a, int b) { return a * b; });
+  tensor->data.Resize(sizeof(T) * dim);
   int c = 0;
   for (const auto &f : data) {
-    for (float v : f) { static_cast<float *>(tensor->data.data())[c++] = v; }
+    for (T v : f) {
+      static_cast<T *>(tensor->data.data())[c++] = v;
+    }
+  }
+}
+
+std::string DescribeTensor(const PaddleTensor &tensor) {
+  std::stringstream os;
+  os << "Tensor [" << tensor.name << "]\n";
+  os << " - type: ";
+  switch (tensor.dtype) {
+    case PaddleDType::FLOAT32:
+      os << "float32";
+      break;
+    case PaddleDType::INT64:
+      os << "int64";
+      break;
+    default:
+      os << "unset";
+  }
+  os << '\n';
+
+  os << " - shape: " << to_string(tensor.shape) << '\n';
+  os << " - lod: ";
+  for (auto &l : tensor.lod) {
+    os << to_string(l) << "; ";
   }
+  os << "\n";
+  os << " - data: ";
+
+  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
+                            [](int a, int b) { return a * b; });
+  for (int i = 0; i < dim; i++) {
+    os << static_cast<float *>(tensor.data.data())[i] << " ";
+  }
+  os << '\n';
+  return os.str();
+}
+
+void PrintTime(int batch_size, int repeat, int num_threads, int tid,
+               double latency) {
+  LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
+            << ", threads: " << num_threads << ", thread id: " << tid
+            << ", latency: " << latency << "ms ======";
 }
 
 }  // namespace inference
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 1baa64c249f291ec1bc874be5031abe6d4368274..995da11e4a30eca72a91a53d3293aa8b033b012b 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -150,6 +150,21 @@ struct TensorRTConfig : public NativeConfig {
   int workspace_size{1 << 30};
 };
 
+// NOTE WIP, not stable yet.
+struct AnalysisConfig : public NativeConfig {
+  //
+  enum class IrPassMode {
+    kSystem,   // Use system default passes, not customize.
+    kInclude,  // Specify the passes in `ir_passes`.
+    kExclude   // Specify the disabled passes in `ir_passes`.
+  };
+
+  bool enable_ir_optim = true;
+  IrPassMode ir_mode{IrPassMode::kExclude};
+  // attention lstm fuse works only on some specific models, disable as default.
+  std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
+};
+
 // A factory to help create different predictors.
 //
 // FOR EXTENSION DEVELOPER:
diff --git a/paddle/fluid/inference/api/timer.h b/paddle/fluid/inference/api/timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..2df5274dc1f2e7ad8e434f1da9d5ae6aee94c784
--- /dev/null
+++ b/paddle/fluid/inference/api/timer.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <chrono>  // NOLINT
+
+namespace paddle {
+namespace inference {
+
+// Timer for timer
+class Timer {
+ public:
+  std::chrono::high_resolution_clock::time_point start;
+  std::chrono::high_resolution_clock::time_point startu;
+
+  void tic() { start = std::chrono::high_resolution_clock::now(); }
+  double toc() {
+    startu = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> time_span =
+        std::chrono::duration_cast<std::chrono::duration<double>>(startu -
+                                                                  start);
+    double used_time_ms = static_cast<double>(time_span.count()) * 1000.0;
+    return used_time_ms;
+  }
+};
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/paddle_fluid.map b/paddle/fluid/inference/paddle_fluid.map
index 5203784dc1fcb672eb6a26d9dfd3ffbe02e08038..7e5cae04b81e6ce759b92f6c4b921ecf974e8260 100644
--- a/paddle/fluid/inference/paddle_fluid.map
+++ b/paddle/fluid/inference/paddle_fluid.map
@@ -1,6 +1,7 @@
 {
 	global:
 		*paddle*;
+                *Pass*;
 	local:
 		*;
 };
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 2f8fadcefe5d5b6f428092914c7e999ec7524862..7ec1e78da4ec642cb1e6248edfbcfed748fa11b8 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -178,6 +178,8 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP(relu);\n")
       elseif(${TARGET} STREQUAL "fake_dequantize")
         file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
+      elseif(${TARGET} STREQUAL "fake_quantize")
+        file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
       elseif(${TARGET} STREQUAL "tensorrt_engine_op")
           message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
       elseif(${TARGET} STREQUAL "fc")
@@ -293,6 +295,7 @@ op_library(extract_rows_op DEPS memory)
 op_library(flatten_op DEPS reshape_op)
 op_library(sequence_pad_op DEPS sequence_padding)
 op_library(unstack_op DEPS stack_op)
+op_library(fake_quantize_op DEPS memory)
 
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 912415192659dc004f54a76e9cd1a20581d512a6..2e31d1c9c708225135e27c93ba94722794c4b282 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -865,8 +865,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto temp1 = static_cast<T>(1) /
                  (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
-    auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
-    dx.device(d) = dout * ((beta * out) + temp2);
+    auto temp2 = temp1 * (static_cast<T>(1) - (static_cast<T>(beta) * out));
+    dx.device(d) = dout * ((static_cast<T>(beta) * out) + temp2);
   }
 };
 
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index a02128c5a54c80ca7ccf9db347cd53f28bbb50f8..39b0c856996c11c6efdb530f1396afd5731c778d 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/attention_lstm_op.h"
-#include <sys/time.h>
 #include <string>
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc
index 5edecd18e673da326ec119cf9a383f24f8045089..dfaa7456f917c1308984b361afed752f96ea6f59 100644
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/auc_op.h"
-#include <string>
 
 namespace paddle {
 namespace operators {
@@ -36,15 +35,12 @@ class AucOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(predict_height, label_height,
                       "Out and Label should have same height.");
 
-    int num_thres = ctx->Attrs().Get<int>("num_thresholds");
+    int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
 
     ctx->SetOutputDim("AUC", {1});
-    ctx->SetOutputDim("TPOut", {num_thres});
-    ctx->SetOutputDim("TNOut", {num_thres});
-    ctx->SetOutputDim("FPOut", {num_thres});
-    ctx->SetOutputDim("FNOut", {num_thres});
-
-    ctx->ShareLoD("Predict", /*->*/ "AUC");
+    ctx->SetOutputDim("BatchAUC", {1});
+    ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
+    ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
   }
 
  protected:
@@ -66,25 +62,24 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Label",
              "A 2D int tensor indicating the label of the training data. "
              "shape: [batch_size, 1]");
-    AddInput("TP", "True-Positive value.");
-    AddInput("FP", "False-Positive value.");
-    AddInput("TN", "True-Negative value.");
-    AddInput("FN", "False-Negative value.");
     // TODO(typhoonzero): support weight input
+    AddInput("StatPos", "Statistic value when label = 1");
+    AddInput("StatNeg", "Statistic value when label = 0");
+
     AddOutput("AUC",
               "A scalar representing the "
               "current area-under-the-curve.");
-    AddOutput("TPOut", "True-Positive value.");
-    AddOutput("FPOut", "False-Positive value.");
-    AddOutput("TNOut", "True-Negative value.");
-    AddOutput("FNOut", "False-Negative value.");
+    AddOutput("BatchAUC", "The AUC for current batch");
+    AddOutput("StatPosOut", "Statistic value when label = 1");
+    AddOutput("StatNegOut", "Statistic value when label = 0");
 
     AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
         .SetDefault("ROC");
+
     AddAttr<int>("num_thresholds",
                  "The number of thresholds to use when discretizing the"
                  " roc curve.")
-        .SetDefault(200);
+        .SetDefault((2 << 12) - 1);
 
     AddComment(R"DOC(
 Area Under The Curve (AUC) Operator.
diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h
index 0a18585edb54a76aff5ae72ecc71e0eebb9f9361..fb0517d70635e090f8c5b59ff9d8420fc34c747b 100644
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -23,106 +23,85 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
 template <typename DeviceContext, typename T>
 class AucKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* predict = ctx.Input<Tensor>("Predict");
-    auto* label = ctx.Input<Tensor>("Label");
-    auto* auc = ctx.Output<Tensor>("AUC");
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *predict = ctx.Input<Tensor>("Predict");
+    auto *label = ctx.Input<Tensor>("Label");
+
+    std::string curve = ctx.Attr<std::string>("curve");
+    int num_thresholds = ctx.Attr<int>("num_thresholds");
+    int num_pred_buckets = num_thresholds + 1;
+
     // Only use output var for now, make sure it's persistable and
     // not cleaned up for each batch.
-    auto* true_positive = ctx.Output<Tensor>("TPOut");
-    auto* false_positive = ctx.Output<Tensor>("FPOut");
-    auto* true_negative = ctx.Output<Tensor>("TNOut");
-    auto* false_negative = ctx.Output<Tensor>("FNOut");
+    auto *auc = ctx.Output<Tensor>("AUC");
+    auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
+    auto *stat_neg = ctx.Output<Tensor>("StatNegOut");
 
-    auto* auc_data = auc->mutable_data<double>(ctx.GetPlace());
+    auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
+    auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
+    calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
+            auc);
 
-    std::string curve = ctx.Attr<std::string>("curve");
-    int num_thresholds = ctx.Attr<int>("num_thresholds");
-    std::vector<double> thresholds_list;
-    thresholds_list.reserve(num_thresholds);
-    for (int i = 1; i < num_thresholds - 1; i++) {
-      thresholds_list[i] = static_cast<double>(i) / (num_thresholds - 1);
-    }
-    const double kEpsilon = 1e-7;
-    thresholds_list[0] = 0.0f - kEpsilon;
-    thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon;
+    auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
+    std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
+    std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
+    calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
+            num_thresholds, batch_auc);
+  }
 
+ private:
+  inline static double trapezoidArea(double X1, double X2, double Y1,
+                                     double Y2) {
+    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
+  }
+
+  inline static void calcAuc(const framework::ExecutionContext &ctx,
+                             const framework::Tensor *label,
+                             const framework::Tensor *predict,
+                             int64_t *stat_pos, int64_t *stat_neg,
+                             int num_thresholds,
+                             framework::Tensor *auc_tensor) {
     size_t batch_size = predict->dims()[0];
     size_t inference_width = predict->dims()[1];
+    const T *inference_data = predict->data<T>();
+    const auto *label_data = label->data<int64_t>();
+
+    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
 
-    const T* inference_data = predict->data<T>();
-    const auto* label_data = label->data<int64_t>();
-
-    auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
-    auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
-    auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());
-    auto* fp_data = false_positive->mutable_data<int64_t>(ctx.GetPlace());
-
-    for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
-      // calculate TP, FN, TN, FP for current thresh
-      int64_t tp = 0, fn = 0, tn = 0, fp = 0;
-      for (size_t i = 0; i < batch_size; i++) {
-        // NOTE: label_data used as bool, labels > 0 will be treated as true.
-        if (label_data[i]) {
-          if (inference_data[i * inference_width + 1] >=
-              (thresholds_list[idx_thresh])) {
-            tp++;
-          } else {
-            fn++;
-          }
-        } else {
-          if (inference_data[i * inference_width + 1] >=
-              (thresholds_list[idx_thresh])) {
-            fp++;
-          } else {
-            tn++;
-          }
-        }
+    for (size_t i = 0; i < batch_size; i++) {
+      uint32_t binIdx = static_cast<uint32_t>(
+          inference_data[i * inference_width + 1] * num_thresholds);
+      if (label_data[i]) {
+        stat_pos[binIdx] += 1.0;
+      } else {
+        stat_neg[binIdx] += 1.0;
       }
-      // store rates
-      tp_data[idx_thresh] += tp;
-      fn_data[idx_thresh] += fn;
-      tn_data[idx_thresh] += tn;
-      fp_data[idx_thresh] += fp;
     }
-    // epsilon to avoid divide by zero.
-    double epsilon = 1e-6;
-    // Riemann sum to caculate auc.
-    Tensor tp_rate, fp_rate, rec_rate;
-    tp_rate.Resize({num_thresholds});
-    fp_rate.Resize({num_thresholds});
-    rec_rate.Resize({num_thresholds});
-    auto* tp_rate_data = tp_rate.mutable_data<double>(ctx.GetPlace());
-    auto* fp_rate_data = fp_rate.mutable_data<double>(ctx.GetPlace());
-    auto* rec_rate_data = rec_rate.mutable_data<double>(ctx.GetPlace());
-    for (int i = 0; i < num_thresholds; i++) {
-      tp_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
-                        (tp_data[i] + fn_data[i] + epsilon);
-      fp_rate_data[i] =
-          static_cast<double>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
-      rec_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
-                         (tp_data[i] + fp_data[i] + epsilon);
+
+    *auc = 0.0f;
+
+    double totPos = 0.0;
+    double totNeg = 0.0;
+    double totPosPrev = 0.0;
+    double totNegPrev = 0.0;
+
+    int idx = num_thresholds;
+
+    while (idx >= 0) {
+      totPosPrev = totPos;
+      totNegPrev = totNeg;
+      totPos += stat_pos[idx];
+      totNeg += stat_neg[idx];
+      *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
+
+      --idx;
     }
-    *auc_data = 0.0f;
-    if (curve == "ROC") {
-      for (int i = 0; i < num_thresholds - 1; i++) {
-        auto dx = fp_rate_data[i] - fp_rate_data[i + 1];
-        auto y = (tp_rate_data[i] + tp_rate_data[i + 1]) / 2.0f;
-        *auc_data = *auc_data + dx * y;
-      }
-    } else if (curve == "PR") {
-      for (int i = 1; i < num_thresholds; i++) {
-        auto dx = tp_rate_data[i] - tp_rate_data[i - 1];
-        auto y = (rec_rate_data[i] + rec_rate_data[i - 1]) / 2.0f;
-        *auc_data = *auc_data + dx * y;
-      }
+
+    if (totPos > 0.0 && totNeg > 0.0) {
+      *auc = *auc / totPos / totNeg;
     }
   }
 };
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 10d678111f5325e495b24286e6ecf651230393fe..b6cb935814e25b31d4104f9ce24fe952680cb491 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -74,7 +74,7 @@ struct BeamSearchDecodeFunctor {
   }
 
   template <typename T>
-  void operator()() const;
+  void apply() const;
 
   bool tensor_on_gpu_;
   size_t beam_size_;
@@ -88,7 +88,7 @@ struct BeamSearchDecodeFunctor {
 };
 
 template <typename T>
-void BeamSearchDecodeFunctor::operator()() const {
+void BeamSearchDecodeFunctor::apply() const {
   BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
   // Check if the tensor is on GPU. If so, use the CPU copy instead
   if (tensor_on_gpu_) {
@@ -101,7 +101,7 @@ void BeamSearchDecodeFunctor::operator()() const {
 }
 
 template <>
-void BeamSearchDecodeFunctor::operator()<bool>() const {
+void BeamSearchDecodeFunctor::apply<bool>() const {
   PADDLE_THROW("beam search decode op does not support bool!");
 }
 
diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h
index 6220e57f5941d89cbf0aea268b85ad55af6132cc..8fa0416049f8fa128d7ab61f8350b41960f07263 100644
--- a/paddle/fluid/operators/cast_op.h
+++ b/paddle/fluid/operators/cast_op.h
@@ -37,7 +37,7 @@ struct CastOpFunctor {
       : in_(in), out_(out), ctx_(ctx) {}
 
   template <typename OutT>
-  void operator()() const {
+  void apply() const {
     auto* in_begin = in_->data<InT>();
     auto numel = in_->numel();
     auto* in_end = in_begin + numel;
diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..0dee1781623d5a62830545c0952e5aadbe37accb
--- /dev/null
+++ b/paddle/fluid/operators/detection/bbox_util.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+
+/*
+ * transform that computes target bounding-box regression deltas
+ * given proposal boxes and ground-truth boxes.
+ */
+template <typename T>
+inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes,
+                       const framework::Tensor& gt_boxes, const T* weights,
+                       const bool normalized, framework::Tensor* box_delta) {
+  auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
+  auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
+  auto trg = framework::EigenTensor<T, 2>::From(*box_delta);
+  T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y;
+  for (int64_t i = 0; i < box_num; ++i) {
+    ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + (normalized == false);
+    ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + (normalized == false);
+    ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w;
+    ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h;
+
+    gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + (normalized == false);
+    gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + (normalized == false);
+    gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w;
+    gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h;
+
+    trg(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w;
+    trg(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h;
+    trg(i, 2) = std::log(gt_w / ex_w);
+    trg(i, 3) = std::log(gt_h / ex_h);
+
+    if (weights) {
+      trg(i, 0) = trg(i, 0) / weights[0];
+      trg(i, 1) = trg(i, 1) / weights[1];
+      trg(i, 2) = trg(i, 2) / weights[2];
+      trg(i, 3) = trg(i, 3) / weights[3];
+    }
+  }
+}
+
+template <typename T>
+void Gather(const T* in, const int in_stride, const int* index, const int num,
+            T* out) {
+  const int stride_bytes = in_stride * sizeof(T);
+  for (int i = 0; i < num; ++i) {
+    int id = index[i];
+    memcpy(out + i * in_stride, in + id * in_stride, stride_bytes);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
index 0571c46f6be99c9a06b7dd2abb310eeda506ecd5..be06dc19743cfa6f093bcb3f4e9f91af315d4211 100644
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/concat.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -133,31 +134,6 @@ void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes,
   }
 }
 
-template <typename T>
-void BoxToDelta(int box_num, const Tensor& ex_boxes, const Tensor& gt_boxes,
-                const std::vector<float>& weights, Tensor* box_delta) {
-  auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
-  auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
-  auto box_delta_et = framework::EigenTensor<T, 2>::From(*box_delta);
-  T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y;
-  for (int64_t i = 0; i < box_num; ++i) {
-    ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + 1;
-    ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + 1;
-    ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w;
-    ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h;
-
-    gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + 1;
-    gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + 1;
-    gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w;
-    gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h;
-
-    box_delta_et(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0];
-    box_delta_et(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1];
-    box_delta_et(i, 2) = log(gt_w / ex_w) / ex_w / weights[2];
-    box_delta_et(i, 3) = log(gt_h / ex_h) / ex_h / weights[3];
-  }
-}
-
 template <typename T>
 std::vector<std::vector<int>> SampleFgBgGt(
     const platform::CPUDeviceContext& context, Tensor* iou,
@@ -243,12 +219,11 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
                        Tensor* sampled_labels, Tensor* sampled_gts) {
   int fg_num = fg_inds.size();
   int bg_num = bg_inds.size();
-  int gt_num = fg_num + bg_num;
   Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t;
   int* fg_inds_data = fg_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
   int* bg_inds_data = bg_inds_t.mutable_data<int>({bg_num}, context.GetPlace());
   int* gt_box_inds_data =
-      gt_box_inds_t.mutable_data<int>({gt_num}, context.GetPlace());
+      gt_box_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
   int* gt_label_inds_data =
       gt_label_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
   std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data);
@@ -303,18 +278,20 @@ std::vector<Tensor> SampleRoisForOneImage(
 
   // Gather boxes and labels
   Tensor sampled_boxes, sampled_labels, sampled_gts;
-  int boxes_num = fg_inds.size() + bg_inds.size();
+  int fg_num = fg_inds.size();
+  int bg_num = bg_inds.size();
+  int boxes_num = fg_num + bg_num;
   framework::DDim bbox_dim({boxes_num, kBoxDim});
   sampled_boxes.mutable_data<T>(bbox_dim, context.GetPlace());
   sampled_labels.mutable_data<int>({boxes_num}, context.GetPlace());
-  sampled_gts.mutable_data<T>(bbox_dim, context.GetPlace());
+  sampled_gts.mutable_data<T>({fg_num, kBoxDim}, context.GetPlace());
   GatherBoxesLabels<T>(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds,
                        gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts);
 
   // Compute targets
   Tensor bbox_targets_single;
   bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace());
-  BoxToDelta<T>(boxes_num, sampled_boxes, sampled_gts, bbox_reg_weights,
+  BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, nullptr, false,
                 &bbox_targets_single);
 
   // Scale rois
@@ -427,7 +404,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
     auto rpn_rois_lod = rpn_rois->lod().back();
     auto gt_classes_lod = gt_classes->lod().back();
     auto gt_boxes_lod = gt_boxes->lod().back();
-    for (size_t i = 0; i < n; ++i) {
+    for (int i = 0; i < n; ++i) {
       Tensor rpn_rois_slice =
           rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]);
       Tensor gt_classes_slice =
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index d29b0153389574de8992b93ac6795e91556af870..ebe6830eccd87a156768eb0d4b96220bcc9f4edc 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -33,7 +33,7 @@ struct AppendProposalsFunctor {
       : out_(out), offset_(offset), to_add_(to_add) {}
 
   template <typename T>
-  void operator()() const {
+  void apply() const {
     auto *out_data = out_->data<T>();
     auto *to_add_data = to_add_->data<T>();
     memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T));
@@ -311,8 +311,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
 
     rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
                               context.GetPlace());
-    rpn_roi_probs->mutable_data<T>({scores->numel() / 4, 1},
-                                   context.GetPlace());
+    rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
 
     Tensor bbox_deltas_swap, scores_swap;
     bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
@@ -421,7 +420,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
     CPUGather<T>(ctx, proposals, keep, &bbox_sel);
     CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
     if (nms_thresh <= 0) {
-      return std::make_pair(bbox_sel, scores_sel);
+      return std::make_pair(bbox_sel, scores_filter);
     }
 
     Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index 177ff7cf187bc9daf69889e99ca57ae18766de90..88757f25cd9a5789758640de2d9cae0b12350b25 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <random>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
@@ -46,156 +47,219 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
     auto in_dims = ctx->GetInputDim("DistMat");
     PADDLE_ENFORCE_EQ(in_dims.size(), 2,
                       "The rank of Input(DistMat) must be 2.");
+
+    ctx->SetOutputDim("LocationIndex", {-1});
+    ctx->SetOutputDim("ScoreIndex", {-1});
+    ctx->SetOutputDim("TargetLabel", {-1, 1});
+    ctx->SetOutputDim("TargetBBox", {-1, 4});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>("DistMat")->type()),
+        platform::CPUPlace());
   }
 };
 
 template <typename T>
 class RpnTargetAssignKernel : public framework::OpKernel<T> {
  public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* anchor_t = context.Input<Tensor>("Anchor");  // (H*W*A) * 4
+    auto* gt_bbox_t = context.Input<Tensor>("GtBox");
+    auto* dist_t = context.Input<LoDTensor>("DistMat");
+
+    auto* loc_index_t = context.Output<Tensor>("LocationIndex");
+    auto* score_index_t = context.Output<Tensor>("ScoreIndex");
+    auto* tgt_bbox_t = context.Output<Tensor>("TargetBBox");
+    auto* tgt_lbl_t = context.Output<Tensor>("TargetLabel");
+
+    auto lod = dist_t->lod().back();
+    int64_t batch_num = static_cast<int64_t>(lod.size() - 1);
+    int64_t anchor_num = dist_t->dims()[1];
+    PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]);
+
+    int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
+    float pos_threshold = context.Attr<float>("rpn_positive_overlap");
+    float neg_threshold = context.Attr<float>("rpn_negative_overlap");
+    float fg_fraction = context.Attr<float>("fg_fraction");
+
+    int fg_num_per_batch = static_cast<int>(rpn_batch_size * fg_fraction);
+
+    int64_t max_num = batch_num * anchor_num;
+    auto place = context.GetPlace();
+
+    tgt_bbox_t->mutable_data<T>({max_num, 4}, place);
+    auto* loc_index = loc_index_t->mutable_data<int>({max_num}, place);
+    auto* score_index = score_index_t->mutable_data<int>({max_num}, place);
+
+    Tensor tmp_tgt_lbl;
+    auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data<int64_t>({max_num}, place);
+    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
+    iset(dev_ctx, &tmp_tgt_lbl, static_cast<int64_t>(-1));
+
+    std::random_device rnd;
+    std::minstd_rand engine;
+    int seed =
+        context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+    engine.seed(seed);
+
+    int fg_num = 0;
+    int bg_num = 0;
+    for (int i = 0; i < batch_num; ++i) {
+      Tensor dist = dist_t->Slice(lod[i], lod[i + 1]);
+      Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]);
+      auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold,
+                                   rpn_batch_size, fg_num_per_batch, engine,
+                                   tmp_lbl_data + i * anchor_num);
+
+      int cur_fg_num = fg_bg_gt[0].size();
+      int cur_bg_num = fg_bg_gt[1].size();
+      std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index,
+                     [i, anchor_num](int d) { return d + i * anchor_num; });
+      memcpy(score_index, loc_index, cur_fg_num * sizeof(int));
+      std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(),
+                     score_index + cur_fg_num,
+                     [i, anchor_num](int d) { return d + i * anchor_num; });
+
+      // get target bbox deltas
+      if (cur_fg_num) {
+        Tensor fg_gt;
+        T* gt_data = fg_gt.mutable_data<T>({cur_fg_num, 4}, place);
+        Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num);
+        T* tgt_data = tgt_bbox.data<T>();
+        Gather<T>(anchor_t->data<T>(), 4,
+                  reinterpret_cast<int*>(&fg_bg_gt[0][0]), cur_fg_num,
+                  tgt_data);
+        Gather<T>(gt_bbox.data<T>(), 4, reinterpret_cast<int*>(&fg_bg_gt[2][0]),
+                  cur_fg_num, gt_data);
+        BoxToDelta<T>(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox);
+      }
+
+      loc_index += cur_fg_num;
+      score_index += cur_fg_num + cur_bg_num;
+      fg_num += cur_fg_num;
+      bg_num += cur_bg_num;
+    }
+
+    int lbl_num = fg_num + bg_num;
+    PADDLE_ENFORCE_LE(fg_num, max_num);
+    PADDLE_ENFORCE_LE(lbl_num, max_num);
+
+    tgt_bbox_t->Resize({fg_num, 4});
+    loc_index_t->Resize({fg_num});
+    score_index_t->Resize({lbl_num});
+    auto* lbl_data = tgt_lbl_t->mutable_data<int64_t>({lbl_num, 1}, place);
+    Gather<int64_t>(tmp_lbl_data, 1, score_index_t->data<int>(), lbl_num,
+                    lbl_data);
+  }
+
+ private:
   void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max,
                    const int row, const int col, const float pos_threshold,
-                   const float neg_threshold, int64_t* target_label_data,
+                   const float neg_threshold, int64_t* target_label,
                    std::vector<int>* fg_inds, std::vector<int>* bg_inds) const {
-    int fg_offset = fg_inds->size();
-    int bg_offset = bg_inds->size();
+    float epsilon = 0.0001;
     for (int64_t i = 0; i < row; ++i) {
       const T* v = dist_data + i * col;
-      T max_dist = *std::max_element(v, v + col);
+      T max = *std::max_element(v, v + col);
       for (int64_t j = 0; j < col; ++j) {
-        T val = dist_data[i * col + j];
-        if (val == max_dist) target_label_data[j] = 1;
+        if (std::abs(max - v[j]) < epsilon) {
+          target_label[j] = 1;
+        }
       }
     }
 
-    // Pick the fg/bg and count the number
+    // Pick the fg/bg
+    const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
     for (int64_t j = 0; j < col; ++j) {
-      if (anchor_to_gt_max.data<T>()[j] > pos_threshold) {
-        target_label_data[j] = 1;
-      } else if (anchor_to_gt_max.data<T>()[j] < neg_threshold) {
-        target_label_data[j] = 0;
+      if (anchor_to_gt_max_data[j] >= pos_threshold) {
+        target_label[j] = 1;
+      } else if (anchor_to_gt_max_data[j] < neg_threshold) {
+        target_label[j] = 0;
       }
-      if (target_label_data[j] == 1) {
-        fg_inds->push_back(fg_offset + j);
-      } else if (target_label_data[j] == 0) {
-        bg_inds->push_back(bg_offset + j);
+      if (target_label[j] == 1) {
+        fg_inds->push_back(j);
+      } else if (target_label[j] == 0) {
+        bg_inds->push_back(j);
       }
     }
   }
 
-  void ReservoirSampling(const int num, const int offset,
-                         std::minstd_rand engine,
+  void ReservoirSampling(const int num, std::minstd_rand engine,
                          std::vector<int>* inds) const {
     std::uniform_real_distribution<float> uniform(0, 1);
-    const int64_t size = static_cast<int64_t>(inds->size() - offset);
-    if (size > num) {
-      for (int64_t i = num; i < size; ++i) {
+    size_t len = inds->size();
+    if (len > static_cast<size_t>(num)) {
+      for (size_t i = num; i < len; ++i) {
         int rng_ind = std::floor(uniform(engine) * i);
         if (rng_ind < num)
-          std::iter_swap(inds->begin() + rng_ind + offset,
-                         inds->begin() + i + offset);
+          std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
       }
+      inds->resize(num);
     }
   }
 
-  void RpnTargetAssign(const framework::ExecutionContext& ctx,
-                       const Tensor& dist, const float pos_threshold,
-                       const float neg_threshold, const int rpn_batch_size,
-                       const int fg_num, std::minstd_rand engine,
-                       std::vector<int>* fg_inds, std::vector<int>* bg_inds,
-                       int64_t* target_label_data) const {
+  // std::vector<std::vector<int>> RpnTargetAssign(
+  std::vector<std::vector<int>> SampleFgBgGt(
+      const platform::CPUDeviceContext& ctx, const Tensor& dist,
+      const float pos_threshold, const float neg_threshold,
+      const int rpn_batch_size, const int fg_num, std::minstd_rand engine,
+      int64_t* target_label) const {
     auto* dist_data = dist.data<T>();
-    int64_t row = dist.dims()[0];
-    int64_t col = dist.dims()[1];
-    int fg_offset = fg_inds->size();
-    int bg_offset = bg_inds->size();
+    int row = dist.dims()[0];
+    int col = dist.dims()[1];
+
+    std::vector<int> fg_inds;
+    std::vector<int> bg_inds;
+    std::vector<int> gt_inds;
 
     // Calculate the max IoU between anchors and gt boxes
-    Tensor anchor_to_gt_max;
-    anchor_to_gt_max.mutable_data<T>(
-        framework::make_ddim({static_cast<int64_t>(col), 1}),
-        platform::CPUPlace());
-    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
-                       .eigen_device();
-    auto x = EigenMatrix<T>::From(dist);
-    auto x_col_max = EigenMatrix<T>::From(anchor_to_gt_max);
-    x_col_max.device(place) =
-        x.maximum(Eigen::DSizes<int, 1>(0))
-            .reshape(Eigen::DSizes<int, 2>(static_cast<int64_t>(col), 1));
+    // Map from anchor to gt box that has highest overlap
+    auto place = ctx.GetPlace();
+    Tensor anchor_to_gt_max, anchor_to_gt_argmax;
+    anchor_to_gt_max.mutable_data<T>({col}, place);
+    int* argmax = anchor_to_gt_argmax.mutable_data<int>({col}, place);
+
+    auto x = framework::EigenMatrix<T>::From(dist);
+    auto x_col_max = framework::EigenVector<T>::Flatten(anchor_to_gt_max);
+    auto x_col_argmax =
+        framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
+    x_col_max = x.maximum(Eigen::DSizes<int, 1>(0));
+    x_col_argmax = x.argmax(0).template cast<int>();
+
     // Follow the Faster RCNN's implementation
     ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
-                neg_threshold, target_label_data, fg_inds, bg_inds);
+                neg_threshold, target_label, &fg_inds, &bg_inds);
     // Reservoir Sampling
-    ReservoirSampling(fg_num, fg_offset, engine, fg_inds);
-    int bg_num = rpn_batch_size - (fg_inds->size() - fg_offset);
-    ReservoirSampling(bg_num, bg_offset, engine, bg_inds);
-  }
+    ReservoirSampling(fg_num, engine, &fg_inds);
+    int fg_num2 = static_cast<int>(fg_inds.size());
+    int bg_num = rpn_batch_size - fg_num2;
+    ReservoirSampling(bg_num, engine, &bg_inds);
 
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* dist = context.Input<LoDTensor>("DistMat");
-    auto* loc_index = context.Output<Tensor>("LocationIndex");
-    auto* score_index = context.Output<Tensor>("ScoreIndex");
-    auto* tgt_lbl = context.Output<Tensor>("TargetLabel");
-
-    auto col = dist->dims()[1];
-    int64_t n = dist->lod().size() == 0UL
-                    ? 1
-                    : static_cast<int64_t>(dist->lod().back().size() - 1);
-    if (dist->lod().size()) {
-      PADDLE_ENFORCE_EQ(dist->lod().size(), 1UL,
-                        "Only support 1 level of LoD.");
+    gt_inds.reserve(fg_num2);
+    for (int i = 0; i < fg_num2; ++i) {
+      gt_inds.emplace_back(argmax[fg_inds[i]]);
     }
-    int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
-    float pos_threshold = context.Attr<float>("rpn_positive_overlap");
-    float neg_threshold = context.Attr<float>("rpn_negative_overlap");
-    float fg_fraction = context.Attr<float>("fg_fraction");
-
-    int fg_num = static_cast<int>(rpn_batch_size * fg_fraction);
-
-    int64_t* target_label_data =
-        tgt_lbl->mutable_data<int64_t>({n * col, 1}, context.GetPlace());
+    std::vector<std::vector<int>> fg_bg_gt;
+    fg_bg_gt.emplace_back(fg_inds);
+    fg_bg_gt.emplace_back(bg_inds);
+    fg_bg_gt.emplace_back(gt_inds);
 
-    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
-    math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
-    iset(dev_ctx, tgt_lbl, static_cast<int>(-1));
-
-    std::vector<int> fg_inds;
-    std::vector<int> bg_inds;
-    std::random_device rnd;
-    std::minstd_rand engine;
-    int seed =
-        context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
-    engine.seed(seed);
-
-    if (n == 1) {
-      RpnTargetAssign(context, *dist, pos_threshold, neg_threshold,
-                      rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
-                      target_label_data);
-    } else {
-      auto lod = dist->lod().back();
-      for (size_t i = 0; i < lod.size() - 1; ++i) {
-        Tensor one_ins = dist->Slice(lod[i], lod[i + 1]);
-        RpnTargetAssign(context, one_ins, pos_threshold, neg_threshold,
-                        rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
-                        target_label_data + i * col);
-      }
-    }
-    int* loc_index_data = loc_index->mutable_data<int>(
-        {static_cast<int>(fg_inds.size())}, context.GetPlace());
-    int* score_index_data = score_index->mutable_data<int>(
-        {static_cast<int>(fg_inds.size() + bg_inds.size())},
-        context.GetPlace());
-    memcpy(loc_index_data, reinterpret_cast<int*>(&fg_inds[0]),
-           fg_inds.size() * sizeof(int));
-    memcpy(score_index_data, reinterpret_cast<int*>(&fg_inds[0]),
-           fg_inds.size() * sizeof(int));
-    memcpy(score_index_data + fg_inds.size(),
-           reinterpret_cast<int*>(&bg_inds[0]), bg_inds.size() * sizeof(int));
+    return fg_bg_gt;
   }
 };
 
 class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
+    AddInput("Anchor",
+             "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4].");
+    AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4].");
     AddInput(
         "DistMat",
         "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
@@ -241,12 +305,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
         "ScoreIndex",
         "(Tensor), The indexes of foreground and background anchors in all "
         "RPN anchors(The rest anchors are ignored). The shape of the "
-        "ScoreIndex is [F + B], F and B depend on the value of input "
-        "tensor and attributes.");
-    AddOutput("TargetLabel",
-              "(Tensor<int64_t>), The target labels of each anchor with shape "
-              "[K * M, 1], "
-              "K and M is the same as they are in DistMat.");
+        "ScoreIndex is [F + B], F and B are sampled foreground and backgroud "
+        " number.");
+    AddOutput("TargetBBox",
+              "(Tensor<int64_t>), The target bbox deltas with shape "
+              "[F, 4], F is the sampled foreground number.");
+    AddOutput(
+        "TargetLabel",
+        "(Tensor<int64_t>), The target labels of each anchor with shape "
+        "[F + B, 1], F and B are sampled foreground and backgroud number.");
     AddComment(R"DOC(
 This operator can be, for given the IoU between the ground truth bboxes and the
 anchors, to assign classification and regression targets to each prediction.
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index 66784f0b5149a7c479a90a407709d993f4a40a8b..31159a02592a2aff75f7ecf5be924989f0f47071 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -39,19 +39,6 @@ bool RequestSendHandler::Handle(const std::string& varname,
                                 const std::string& out_var_name) {
   VLOG(4) << "RequestSendHandler:" << varname;
 
-  // Async
-  if (!sync_mode_) {
-    rpc_server_->Profiler().OneStep();
-    try {
-      executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
-                                    scope);
-    } catch (std::exception& e) {
-      LOG(ERROR) << "async: run sub program error " << e.what();
-      return false;
-    }
-    return true;
-  }
-
   // Sync
   if (varname == BATCH_BARRIER_MESSAGE) {
     VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
@@ -60,17 +47,31 @@ bool RequestSendHandler::Handle(const std::string& varname,
     VLOG(3) << "sync: recv complete message";
     rpc_server_->Complete();
   } else {
-    VLOG(3) << "sync: received var_name: " << varname;
-    rpc_server_->WaitCond(kRequestSend);
-    VLOG(3) << "sync: processing received var: " << varname;
-
-    if (invar == nullptr) {
-      LOG(FATAL) << "sync: Can not find server side var: " << varname;
-      return false;
-    }
-    if (invar->IsType<framework::SelectedRows>()) {
-      std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
-      sparse_vars_.push_back(invar);
+    // Async
+    if (!sync_mode_) {
+      VLOG(3) << "async process var: " << varname;
+      rpc_server_->Profiler().OneStep();
+      try {
+        executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
+                                      scope);
+      } catch (std::exception& e) {
+        LOG(ERROR) << "async: run sub program error " << e.what();
+        return false;
+      }
+      return true;
+    } else {  // sync
+      rpc_server_->WaitCond(kRequestSend);
+      VLOG(3) << "sync: processing received var: " << varname;
+
+      if (invar == nullptr) {
+        LOG(FATAL) << "sync: Can not find server side var: " << varname;
+        return false;
+      }
+
+      if (invar->IsType<framework::SelectedRows>()) {
+        std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
+        sparse_vars_.push_back(invar);
+      }
     }
   }
   return true;
diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index d5b9b2dac085e7abc31ef243be82eaa815d387ba..b1a399c22c2b9ed7464a1b1764478803d4416d94 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <algorithm>
+#include <iterator>
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -94,8 +95,11 @@ class RowwiseTransformIterator;
 template <typename T, typename DeviceContext>
 class MidWiseTransformIterator;
 
+// NOTE(dzhwinter): ptrdiff_t in iterator is deperecated in c++17
 template <typename T>
-class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
+class RowwiseTransformIterator<T, platform::CPUDeviceContext>
+    : public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
+                           T *, T &> {
  public:
   RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {}
 
@@ -126,7 +130,9 @@ class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
 };
 
 template <typename T>
-class MidWiseTransformIterator<T, platform::CPUDeviceContext> {
+class MidWiseTransformIterator<T, platform::CPUDeviceContext>
+    : public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
+                           T *, T &> {
  public:
   MidWiseTransformIterator(const T *ptr, int n, int post)
       : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
@@ -479,8 +485,13 @@ void ElemwiseGradComputeNoBroadcast(
     const framework::Tensor &dout, int axis, framework::Tensor *dx,
     framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) {
   size_t N = static_cast<size_t>(framework::product(x_dim));
+#if !defined(_WIN32)
   platform::ForRange<DeviceContext> for_range(
       ctx.template device_context<DeviceContext>(), N);
+#else
+  platform::ForRange<DeviceContext> for_range(
+      ctx.device_context<DeviceContext>(), N);
+#endif  // !_WIN32
   for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
       x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
       dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
@@ -633,13 +644,13 @@ void ElementwiseGradCompute(const framework::ExecutionContext &ctx,
 
 template <typename Functor, typename DeviceContext, typename T,
           typename OutType = T>
+
 void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
                           const framework::Tensor *x,
                           const framework::Tensor *y, int axis, Functor func,
                           framework::Tensor *z) {
   TransformFunctor<Functor, T, DeviceContext, OutType> functor(
       x, y, z, ctx.template device_context<DeviceContext>(), func);
-
   auto x_dims = x->dims();
   auto y_dims_untrimed = y->dims();
   PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index a91e0f520e93c01bc5af09b691af2d5a6deda9f2..e608eba05d5680254835f7b25f53d6a59e310e2a 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -14,86 +14,198 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fake_quantize_op.h"
 #include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/clip_op.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
 
-class FakeQuantizeOp : public framework::OperatorWithKernel {
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVectorArrayMap =
+    Eigen::TensorMap<Eigen::Tensor<T, 1, MajorType, IndexType>>;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using ConstEigenVectorArrayMap =
+    Eigen::TensorMap<const Eigen::Tensor<T, 1, MajorType, IndexType>>;
+
+template <typename T>
+struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const T* in,
+                  const int num, T* out) {
+    Eigen::DSizes<Eigen::DenseIndex, 1> idim(num);
+    Eigen::DSizes<Eigen::DenseIndex, 1> odim(1);
+    Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>> in_e(in, idim);
+    Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>> out_e(out, odim);
+
+    out_e = in_e.abs().maximum();
+  }
+};
+
+template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
+
+template <typename T>
+struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, framework::Tensor* out) {
+    T s = scale.data<T>()[0];
+    platform::Transform<platform::CPUDeviceContext> trans;
+    trans(ctx, in.data<T>(), in.data<T>() + in.numel(),
+          out->mutable_data<T>(ctx.GetPlace()), ClipFunctor<T>(-s, s));
+    auto in_e = framework::EigenVector<T>::Flatten(in);
+    auto out_e = framework::EigenVector<T>::Flatten(*out);
+
+    out_e.device(*ctx.eigen_device()) = (bin_cnt / s * in_e).round();
+  }
+};
+
+template struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, float>;
+
+template <typename T>
+struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& cur_scale,
+                  const framework::Tensor& last_scale,
+                  const framework::Tensor& iter, const int window_size,
+                  framework::Tensor* scales_arr, framework::Tensor* out_scale) {
+    T* scale_arr = scales_arr->mutable_data<T>(ctx.GetPlace());
+    int64_t it = iter.data<int64_t>()[0];
+    int idx = it % window_size;
+    T removed = scale_arr[idx];
+    T cur = cur_scale.data<T>()[0];
+    scale_arr[idx] = cur;
+
+    T max = last_scale.data<T>()[0];
+    if (max < cur) {
+      max = cur;
+    } else if (fabs(removed - max) < 1e-6) {
+      int size = (it > window_size) ? window_size : it;
+      FindAbsMaxFunctor<platform::CPUDeviceContext, T>()(ctx, scale_arr, size,
+                                                         &max);
+    }
+    out_scale->mutable_data<T>(ctx.GetPlace())[0] = max;
+  }
+};
+
+template struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, float>;
+
+class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel {
  public:
-  FakeQuantizeOp(const std::string &type,
-                 const framework::VariableNameMap &inputs,
-                 const framework::VariableNameMap &outputs,
-                 const framework::AttributeMap &attrs)
+  FakeQuantizeAbsMaxOp(const std::string& type,
+                       const framework::VariableNameMap& inputs,
+                       const framework::VariableNameMap& outputs,
+                       const framework::AttributeMap& attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of FakeQuantizeOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of FakeQuantizeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("OutMovingScale"),
-                   "OutMovingScale(Out) of FakeQuantizeOp should not be null");
-    // if (ctx->HasInput("InMovingScale")) {
-    ctx->SetOutputDim("OutMovingScale", ctx->GetInputDim("InMovingScale"));
-    //}
-    // if (ctx->HasInput("InScales")) {
-    PADDLE_ENFORCE(ctx->HasOutput("OutScales"),
-                   "OutScales(Out) of FakeQuantizeOp should not be null");
-    ctx->SetOutputDim("OutScales", ctx->GetInputDim("InScales"));
-    // PADDLE_ENFORCE_EQ(ctx->Inputs("InScales")[0],
-    // ctx->Outputs("OutScales")[0],
-    //                  "Mean and MeanOut should share the same memory");
-    //}
+    PADDLE_ENFORCE(ctx->HasOutput("OutScale"),
+                   "Output(Scale) of FakeQuantizeOp should not be null.");
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("OutScale", {1});
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
-class FakeQuantizeOpMaker : public framework::OpProtoAndCheckerMaker {
+class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "(Tensor) Input tensor of scale operator.");
-    AddInput("InScales", "(Tensor) scale buffer, used in static quantization.")
-        .AsDispensable();
-    AddInput("InMovingScale", "Last scale, used in static quantization.")
-        .AsDispensable();
-    AddInput("InCurrentIter",
-             "Last iteration number, used in static quantization.")
-        .AsDispensable();
-    AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
-    AddOutput("OutScales",
-              "(Tensor) scale buffer, used in static quantization.")
-        .AsDispensable();
-    AddOutput("OutMovingScale", " Current scale");
-    AddOutput("OutCurrentIter", "Current iteration number.").AsDispensable();
-    AddAttr<std::string>("quantize_type",
-                         "(string, default abs_max)"
-                         "The scaling tpe of the quantize operator.")
-        .SetDefault("abs_max");
-    AddAttr<int>("window_size", "(int, default 10000)").SetDefault(10000);
+    AddInput("X", "(Tensor) Input is float data type.");
+    AddOutput("Out",
+              "(Tensor) Output of quantized low level tensor, "
+              "but also saved as float data type.");
+    AddOutput("OutScale", "(Tensor) Current scale");
     AddAttr<int>("bit_length", "(int, default 8)")
         .SetDefault(8)
-        .AddCustomChecker([](const int &bit_length) {
+        .AddCustomChecker([](const int& bit_length) {
           PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
                          "'bit_length' should be between 1 and 16.");
         });
-    AddAttr<bool>("is_test", "").SetDefault(false);
     AddComment(R"DOC(
 FakeQuantize operator
 
-quantize_type = abs_max:
+$$scale = max(abs(X))$$ 
+$$range = 2^{bit_length - 1} - 1$$
+$$Out = round(X/scale * range)$$
 
-    $$scale = max(abs(x))$$ 
+)DOC");
+  }
+};
 
-quantize_type = range_abs_max:
+class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
+ public:
+  FakeQuantizeRangeAbsMaxOp(const std::string& type,
+                            const framework::VariableNameMap& inputs,
+                            const framework::VariableNameMap& outputs,
+                            const framework::AttributeMap& attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
-    $$scale = max(max(abs(x)), history_abs_max)$$ 
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of FakeQuantizeRangeAbsMaxOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FakeQuantizeRangeAbsMaxOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("OutScale"),
+        "Output(OutScale) of FakeQuantizeRangeAbsMaxOp should not be null");
+    if (ctx->HasOutput("OutScales")) {
+      int window_size = ctx->Attrs().Get<int>("window_size");
+      ctx->SetOutputDim("OutScales", {window_size});
+    }
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("OutScale", {1});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
 
-quantize_type = moving_average_abs_max:
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
 
-    $$scale = 0.1*scale+0.9*new_abs_max)$$ 
+class FakeQuantizeRangeAbsMaxOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input is float data type.");
+    AddInput("InScale", "Last scale.");
+    AddInput("Iter", "Global step iteration.").AsDispensable();
+    AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
+    AddOutput("OutScale", " Current scale");
+    AddOutput("OutScales", "(Tensor) scale buffer.").AsDispensable();
+    AddAttr<int>("window_size", "(int, default 10000) window range size.")
+        .SetDefault(10000);
+    AddAttr<int>("bit_length", "(int, default 8), quantization bit number.")
+        .SetDefault(8)
+        .AddCustomChecker([](const int& bit_length) {
+          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
+                         "'bit_length' should be between 1 and 16.");
+        });
+    AddAttr<bool>("is_test", "").SetDefault(false);
+    AddComment(R"DOC(
+FakeQuantize operator is used in static quantization.
 
-$$Out = scale*X$$
+$$scale = max(max(abs(x)), history_abs_max)$$ 
+$$range = 2^{bit_length - 1} - 1$$
+$$Out = round(X/scale * range)$$
 
 )DOC");
   }
@@ -103,10 +215,16 @@ $$Out = scale*X$$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxOp,
+                  ops::FakeQuantizeAbsMaxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max,
+                       ops::FakeQuantizeAbsMaxKernel<CPU, float>);
 
-REGISTER_OPERATOR(fake_quantize, ops::FakeQuantizeOp, ops::FakeQuantizeOpMaker,
+REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
+                  ops::FakeQuantizeRangeAbsMaxOpMaker,
                   paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    fake_quantize,
-    ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
+                       ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index be0c6730a5119090600a27c66510b2a095c54583..a0ff6396210c2b3a7f8bd6b9f274b875d7fd4933 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/fake_quantize_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
@@ -20,7 +21,7 @@ namespace paddle {
 namespace operators {
 
 template <typename T>
-__global__ void FindAbsMaxKernel(const int n, const T* in, T* out) {
+__global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
   int bid = threadIdx.x + blockIdx.x * blockDim.x;
   int tid = threadIdx.x;
 
@@ -43,7 +44,7 @@ __global__ void FindAbsMaxKernel(const int n, const T* in, T* out) {
   __syncthreads();
 
   for (int i = blockDim.x / 2; i > 0; i >>= 1) {
-    if (tid < i && shared_max_data[tid] < shared_max_data[tid + i]) {
+    if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
       shared_max_data[tid] = shared_max_data[tid + i];
     }
     __syncthreads();
@@ -53,220 +54,125 @@ __global__ void FindAbsMaxKernel(const int n, const T* in, T* out) {
   }
 }
 
-float FindAbsMaxGpu(const platform::CUDADeviceContext& ctx, const float* array,
-                    int length) {
-  float host_max;
-  int kNumTheads = 1024;
-  int gridDimx = (kNumTheads - 1 + length) / kNumTheads;
-  gridDimx = (gridDimx > kNumTheads) ? kNumTheads : gridDimx;
-  framework::Tensor t;
-  float* device_max = t.mutable_data<float>(framework::make_ddim({gridDimx}),
-                                            platform::CUDAPlace());
-  FindAbsMaxKernel<float><<<gridDimx, kNumTheads, kNumTheads * sizeof(float),
-                            ctx.stream()>>>(length, array, device_max);
-  FindAbsMaxKernel<
-      float><<<1, kNumTheads, kNumTheads * sizeof(float), ctx.stream()>>>(
-      gridDimx, device_max, device_max);
-  PADDLE_ENFORCE_EQ(
-      cudaMemcpy(&host_max, device_max, sizeof(float), cudaMemcpyDeviceToHost),
-      cudaSuccess, "cudaMemcpy failed");
-  return host_max;
-}
+template <typename T>
+struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx, const T* in,
+                  const int num, T* out) {
+    int block = 1024;
+    int grid = (block - 1 + num) / block;
+    grid = (grid > block) ? block : grid;
+
+    framework::Tensor max;
+    T* max_data =
+        max.mutable_data<T>(framework::make_ddim({grid}), ctx.GetPlace());
+    FindAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
+        in, num, max_data);
+    FindAbsMaxKernel<T><<<1, block, 1024 * sizeof(T), ctx.stream()>>>(
+        max_data, grid, out);
+  }
+};
+
+template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
 
 template <typename T>
-__global__ void ApplySaturateKernel(const int n, const T* in, T* out,
-                                    int* num_saturate, const T min,
-                                    const T max) {
+__global__ void ClipAndQuantKernel(const T* in, const T* scale,
+                                   const int bin_cnt, const int n, T* out) {
   int bid = threadIdx.x + blockIdx.x * blockDim.x;
   int tid = threadIdx.x;
 
-  extern __shared__ int shared_count[];
-  shared_count[tid] = 0;
+  T s = scale[0];
   for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
-    if (in[i] > max) {
-      out[i] = max;
-      shared_count[tid] += 1;
-    } else if (in[i] < min) {
-      out[i] = min;
-      shared_count[tid] += 1;
-    } else {
-      out[i] = in[i];
-    }
-  }
-  __syncthreads();
-
-  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
-    if (tid < i) {
-      shared_count[tid] += shared_count[tid + i];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    num_saturate[blockIdx.x] = shared_count[0];
+    T x = in[bid];
+    T v = x > s ? s : x;
+    v = v < -s ? -s : v;
+    v = bin_cnt / s * v;
+    out[bid] = round(v);
   }
 }
 
 template <typename T>
-__global__ void ReduceKernel(const int n, const T* in, T* out) {
-  int tid = threadIdx.x;
-  extern __shared__ T shared_sum[];
-  if (tid < n) {
-    shared_sum[tid] = in[tid];
+__global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
+                                            const T* last_scale,
+                                            const int64_t* iter,
+                                            const int window_size, T* scale_arr,
+                                            T* out_scale, int* need_find_max,
+                                            int* out_size) {
+  int it = iter[0];
+  int idx = it % window_size;
+  T removed = scale_arr[idx];
+  T cur = cur_scale[0];
+  scale_arr[idx] = cur;
+  T max = last_scale[0];
+  out_scale[0] = max < cur ? cur : max;
+  if (fabs(removed - max) < 1e-6) {
+    need_find_max[0] = 1;
+    out_size[0] = it > window_size ? window_size : it;
   } else {
-    shared_sum[tid] = T(0);
-  }
-  __syncthreads();
-  // blockDim.x must >= n
-  for (int i = (n + 1) / 2; i > 0; i >>= 1) {
-    if (tid < i) {
-      shared_sum[tid] += shared_sum[tid + i];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    out[0] = shared_sum[0];
+    need_find_max[0] = 0;
   }
 }
 
 template <typename T>
-int ApplySaturateGpu(const platform::CUDADeviceContext& ctx, const int n,
-                     const T* in, T* out, const T min, const T max) {
-  int host_num_saturate;
-  int kNumTheads = 1024;
-  int gridDimx = (n + kNumTheads - 1) / kNumTheads;
-  gridDimx = (gridDimx > kNumTheads) ? kNumTheads : gridDimx;
-  framework::Tensor t;
-  int* device_num_saturate = t.mutable_data<int>(
-      framework::make_ddim({gridDimx}), platform::CUDAPlace());
-  ApplySaturateKernel<
-      T><<<gridDimx, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>(
-      n, in, out, device_num_saturate, min, max);
-  ReduceKernel<int><<<1, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>(
-      gridDimx, device_num_saturate, device_num_saturate);
-  PADDLE_ENFORCE_EQ(cudaSuccess,
-                    cudaMemcpy(&host_num_saturate, device_num_saturate,
-                               sizeof(int), cudaMemcpyDeviceToHost),
-                    "cudaMemcpy failed");
-  return host_num_saturate;
-}
-
-template <typename DeviceContext, typename T>
-class FakeQuantizeCUDAKernel : public framework::OpKernel<T> {
- public:
-  T FindRangeAbsMax(const platform::CUDADeviceContext& ctx,
-                    framework::Tensor* scale_list, framework::Tensor* out_scale,
-                    const T& cur_scale, int window_size,
-                    int current_iter) const {
-    T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
-    T remove_tmp = sl[current_iter];
-    sl[current_iter] = cur_scale;
-    T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
-    if (max_scale < cur_scale) {
-      max_scale = cur_scale;
-    } else if (fabs(remove_tmp - max_scale) < 1e-6) {
-      int size = (current_iter > window_size) ? window_size : current_iter;
-      max_scale = T(FindAbsMaxGpu(ctx, scale_list->data<float>(), size));
+struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& cur_scale,
+                  const framework::Tensor& last_scale,
+                  const framework::Tensor& iter, const int window_size,
+                  framework::Tensor* scales_arr, framework::Tensor* out_scale) {
+    const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+
+    T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
+    T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
+
+    framework::Tensor need_find_max, out_size;
+    int* find_max = need_find_max.mutable_data<int>(gpu_place);
+    int* out_size_data = out_size.mutable_data<int>(gpu_place);
+
+    FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
+        cur_scale.data<T>(), last_scale.data<T>(), iter.data<int64_t>(),
+        window_size, scale_arr, out_scale_data, find_max, out_size_data);
+
+    int g_find_max;
+    memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max,
+                 sizeof(int), 0);
+    if (g_find_max) {
+      int len;
+      memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data,
+                   sizeof(int), 0);
+      FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(ctx, scale_arr, len,
+                                                          out_scale_data);
     }
-    return max_scale;
-  }
-
-  T FindMovingAverageAbsMmax(framework::Tensor* in_scale,
-                             framework::Tensor* out_scale,
-                             const T& cur_scale) const {
-    T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
-    T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
-    outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
-    return T(outs[0]);
   }
+};
 
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto& device_ctx = context.cuda_device_context();
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    const bool is_test = context.Attr<bool>("is_test");
-    tensor->mutable_data<T>(in->place());
-    context.Output<framework::Tensor>("OutMovingScale")
-        ->mutable_data<T>(
-            context.Input<framework::Tensor>("InMovingScale")->place());
-    auto quantize_type =
-        static_cast<std::string>(context.Attr<std::string>("quantize_type"));
-    if (quantize_type == std::string("range_abs_max")) {
-      context.Output<framework::Tensor>("OutScales")
-          ->mutable_data<T>(
-              context.Input<framework::Tensor>("InScales")->place());
-      context.Output<framework::Tensor>("OutCurrentIter")
-          ->mutable_data<T>(
-              context.Input<framework::Tensor>("InCurrentIter")->place());
-    }
-
-    T scale = T(1);
-    int window_size = context.Attr<int>("window_size");
-    T bin_cnt = (T)((1 << (context.Attr<int>("bit_length") - 1)) - 1);
-    if (quantize_type == std::string("abs_max")) {
-      auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
-      scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
-      saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
-
-      auto& device_ctx = context.template device_context<DeviceContext>();
-      auto* scale_list = context.Output<framework::Tensor>("OutScales");
-      math::SetConstant<DeviceContext, T> scalar;
-      scale_list->mutable_data<T>(context.GetPlace());
-      scalar(device_ctx, scale_list, static_cast<T>(0));
-      auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
-      iter->mutable_data<T>(context.GetPlace());
-      scalar(device_ctx, iter, static_cast<T>(0));
-    } else if (quantize_type == std::string("range_abs_max")) {
-      auto* moving_scale = const_cast<framework::Tensor*>(
-          context.Input<framework::Tensor>("InMovingScale"));
-      if (is_test) {
-        scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
-      } else {
-        auto* it = const_cast<framework::Tensor*>(
-            context.Input<framework::Tensor>("InCurrentIter"));
-        auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
-        int* last_iter = it->mutable_data<int>(platform::CPUPlace());
-        int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
-        auto* scale_list = context.Output<framework::Tensor>("OutScales");
-        auto* saving_scale =
-            context.Output<framework::Tensor>("OutMovingScale");
-        scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
-        scale = FindRangeAbsMax(device_ctx, scale_list, saving_scale, scale,
-                                window_size, current_iter[0]);
-        (*current_iter) = (*last_iter) + 1;
-      }
-    } else if (quantize_type == std::string("moving_average_abs_max")) {
-      auto* moving_scale = const_cast<framework::Tensor*>(
-          context.Input<framework::Tensor>("InMovingScale"));
-      if (is_test) {
-        scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
-      } else {
-        scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
-        auto* saving_scale =
-            context.Output<framework::Tensor>("OutMovingScale");
-        scale = FindMovingAverageAbsMmax(
-            const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
-      }
-    }
-
-    ApplySaturateGpu<T>(device_ctx, in->numel(), in->data<T>(),
-                        tensor->mutable_data<T>(in->place()), -scale, scale);
-    scale = bin_cnt / scale;
+template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
 
-    auto& dev =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
-    eigen_out.device(dev) = (scale * eigen_in).round();
+template <typename T>
+struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, framework::Tensor* out) {
+    int num = in.numel();
+    int block = 1024;
+    int grid = (block - 1 + num) / block;
+
+    const T* in_data = in.data<T>();
+    const T* scale_data = scale.data<T>();
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
+        in_data, scale_data, bin_cnt, num, out_data);
   }
 };
 
+template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
+
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_CUDA_KERNEL(fake_quantize,
-                        paddle::operators::FakeQuantizeCUDAKernel<
-                            paddle::platform::CUDADeviceContext, float>,
-                        paddle::operators::FakeQuantizeCUDAKernel<
-                            paddle::platform::CUDADeviceContext, double>);
+namespace ops = paddle::operators;
+using CUDA = paddle::platform::CUDADeviceContext;
+REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
+                        ops::FakeQuantizeAbsMaxKernel<CUDA, float>);
+REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
+                        ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 80f71d85dde39f773cc489fb79effcc775c5010a..7ace7573ec5c03ab8788cfc0aab614b7f80ea073 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -17,137 +17,91 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/clip_op.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
 
-using platform::Transform;
+template <typename DeviceContext, typename T>
+struct FindAbsMaxFunctor {
+  void operator()(const DeviceContext& ctx, const T* in, const int num, T* out);
+};
 
 template <typename DeviceContext, typename T>
-class FakeQuantizeKernel : public framework::OpKernel<T> {
+struct ClipAndFakeQuantFunctor {
+  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
+                  const framework::Tensor& scale, const int bin_cnt,
+                  framework::Tensor* out);
+};
+
+template <typename DeviceContext, typename T>
+struct FindRangeAbsMaxFunctor {
+  void operator()(const DeviceContext& ctx, const framework::Tensor& cur_scale,
+                  const framework::Tensor& last_scale,
+                  const framework::Tensor& iter, const int window_size,
+                  framework::Tensor* scales_arr, framework::Tensor* out_scale);
+};
+
+template <typename DeviceContext, typename T>
+class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
  public:
-  T FindAbsMax(framework::Tensor* in, int n) const {
-    T* p = in->mutable_data<T>(platform::CPUPlace());
-    T abs_max = (T)0.00000001;
-    for (int i = 0; i < n; i++) {
-      T tmp = fabs(p[i]);
-      if (tmp > abs_max) abs_max = tmp;
-    }
-    return T(abs_max);
-  }
-  T FindRangeAbsMax(framework::Tensor* scale_list, framework::Tensor* out_scale,
-                    const T& cur_scale, int window_size,
-                    int current_iter) const {
-    T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
-    T remove_tmp = sl[current_iter];
-    sl[current_iter] = cur_scale;
-    T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
-    if (max_scale < cur_scale) {
-      max_scale = cur_scale;
-    } else if (fabs(remove_tmp - max_scale) < 1e-6) {
-      int size = (current_iter > window_size) ? window_size : current_iter;
-      max_scale = T(FindAbsMax(scale_list, size));
-    }
-    return max_scale;
-  }
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
 
-  T FindMovingAverageAbsMmax(framework::Tensor* in_scale,
-                             framework::Tensor* out_scale,
-                             const T& cur_scale) const {
-    T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
-    T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
-    outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
-    return T(outs[0]);
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
+    T* out_s = out_scale->mutable_data<T>(context.GetPlace());
+
+    int bit_length = context.Attr<int>("bit_length");
+    int bin_cnt = std::pow(2, bit_length - 1) - 1;
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    const T* in_data = in->data<T>();
+    FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in_data, in->numel(), out_s);
+    ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
+                                                bin_cnt, out);
   }
+};
 
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    auto* tensor = context.Output<framework::Tensor>("Out");
+template <typename DeviceContext, typename T>
+class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<framework::Tensor>("X");
-    const bool is_test = context.Attr<bool>("is_test");
-    tensor->mutable_data<T>(in->place());
-
-    auto* oms_tensor = context.Output<framework::Tensor>("OutMovingScale");
-    oms_tensor->mutable_data<T>(in->place());
-
-    auto quantize_type =
-        static_cast<std::string>(context.Attr<std::string>("quantize_type"));
-    if (quantize_type == std::string("range_abs_max")) {
-      auto* oss_tensor = context.Output<framework::Tensor>("OutScales");
-      oss_tensor->mutable_data<T>(
-          context.Input<framework::Tensor>("InScales")->place());
-      auto* oci_tensor = context.Output<framework::Tensor>("OutCurrentIter");
-      oci_tensor->mutable_data<T>(
-          context.Input<framework::Tensor>("InCurrentIter")->place());
-    }
+    auto* in_scale = context.Input<framework::Tensor>("InScale");
 
-    T scale = static_cast<T>(1);
-    int window_size = context.Attr<int>("window_size");
+    auto* out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+
+    bool is_test = context.Attr<bool>("is_test");
     int bit_length = context.Attr<int>("bit_length");
     int bin_cnt = std::pow(2, bit_length - 1) - 1;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
 
-    auto& dev =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto raw_in = framework::EigenVector<T>::Flatten(*in);
-    if (quantize_type == std::string("abs_max")) {
-      auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
-      auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
-      scale_out.device(dev) = raw_in.abs().maximum();
-      scale = scale_out(0);
-
-      auto& device_ctx = context.template device_context<DeviceContext>();
-      auto* scale_list = context.Output<framework::Tensor>("OutScales");
-      math::SetConstant<DeviceContext, T> scalar;
-      scale_list->mutable_data<T>(context.GetPlace());
-      scalar(device_ctx, scale_list, static_cast<T>(0));
-      auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
-      iter->mutable_data<T>(context.GetPlace());
-      scalar(device_ctx, iter, static_cast<T>(0));
-    } else if (quantize_type == std::string("range_abs_max")) {
-      auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
-      if (is_test) {
-        scale = moving_scale->data<T>()[0];
-      } else {
-        auto* it = context.Input<framework::Tensor>("InCurrentIter");
-        auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
-        const int* last_iter = it->data<int>();
-        int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
-        auto* scale_list = context.Output<framework::Tensor>("OutScales");
-        auto* saving_scale =
-            context.Output<framework::Tensor>("OutMovingScale");
-        auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
-        scale_out.device(dev) = raw_in.abs().maximum();
-        scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
-        scale = FindRangeAbsMax(scale_list, saving_scale, scale, window_size,
-                                current_iter[0]);
-        saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
-        (*current_iter) = (*last_iter) + 1;
-      }
-    } else if (quantize_type == std::string("moving_average_abs_max")) {
-      auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
-      if (is_test) {
-        scale = moving_scale->data<T>()[0];
-      } else {
-        auto* saving_scale =
-            context.Output<framework::Tensor>("OutMovingScale");
-        auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
-        scale_out.device(dev) = raw_in.abs().maximum();
-        scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
-        scale = FindMovingAverageAbsMmax(
-            const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
-        saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
-      }
+    // testing
+    if (is_test) {
+      ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *in_scale,
+                                                  bin_cnt, out);
+      return;
     }
 
-    Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), in->data<T>(),
-          in->data<T>() + in->numel(), tensor->mutable_data<T>(in->place()),
-          ClipFunctor<T>(-scale, scale));
-    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
-    eigen_out.device(dev) = (bin_cnt / scale * eigen_in).round();
+    // training
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
+    auto* out_scales = context.Output<framework::Tensor>("OutScales");
+    auto* iter = context.Input<framework::Tensor>("Iter");
+
+    int window_size = context.Attr<int>("window_size");
+    out_scale->mutable_data<T>(context.GetPlace());
+
+    framework::Tensor cur_scale;
+    T* cur_scale_data = cur_scale.mutable_data<T>({1}, context.GetPlace());
+    FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
+                                          cur_scale_data);
+    FindRangeAbsMaxFunctor<DeviceContext, T>()(dev_ctx, cur_scale, *in_scale,
+                                               *iter, window_size, out_scales,
+                                               out_scale);
+    ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
+                                                bin_cnt, out);
   }
 };
 
diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
index 925dc19061e2196a40411f415eb6e5ad59ab52ff..adc7cb1f9e48ba5fabeb91c5e3ecec016db34a45 100644
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -25,7 +25,7 @@ struct FillOpVisitor {
       : tensor_(tensor), value_(value) {}
 
   template <typename T>
-  void operator()() const {
+  void apply() const {
     platform::CPUPlace cpu;
     auto *data = tensor_->mutable_data<T>(cpu);
     std::transform(value_.data(), value_.data() + tensor_->numel(), data,
diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc
index fdda01381e117cecffb2a05f8399f3ad82a46339..8e80dc0e641c443923076c31e269689b5bc134a7 100644
--- a/paddle/fluid/operators/flatten_op.cc
+++ b/paddle/fluid/operators/flatten_op.cc
@@ -157,6 +157,116 @@ class FlattenGradOp : public framework::OperatorBase {
   }
 };
 
+// FIXME(zcd): flatten2 adds an intermediate output(XShape) based on flatten,
+// the XShape is used to carry the shape and lod of X which will be used in
+// flatten_grad, in this way, the framework can reuse the memory of X
+// immediately the flatten2_op is finished.
+// Considering compatibility issues, we could not fix flatten2_op
+class Flatten2OpInferShape : public FlattenOpInferShape {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    FlattenOpInferShape::operator()(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output (XShape) of Flatten op should not be null.");
+    const auto &in_dims = ctx->GetInputDim("X");
+    std::vector<int64_t> xshape_dims(in_dims.size() + 1);
+    xshape_dims[0] = 0;
+    for (int i = 0; i < in_dims.size(); ++i) {
+      xshape_dims[i + 1] = in_dims[i];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
+    ctx->ShareLoD("X", "XShape");
+  }
+};
+
+class Flatten2Op : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axis = Attr<int>("axis");
+    auto in_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims);
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = out_dims;
+    attrs["inplace"] = false;
+    // Invoke Reshape Op
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+class Flatten2OpMaker : public FlattenOpMaker {
+ public:
+  void Make() override {
+    FlattenOpMaker::Make();
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in FlattenGradOp.")
+        .AsIntermediate();
+  }
+};
+
+class Flatten2GradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("flatten2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+class Flatten2GradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("XShape"),
+                   "Input(XShape) shouldn't be null.");
+    PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    auto xshape_dims = context->GetInputDim("XShape");
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    context->SetOutputDim(framework::GradVarName("X"), x_dims);
+    context->ShareLoD("XShape", framework::GradVarName("X"));
+  }
+};
+
+class Flatten2GradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto xshape_name = Input("XShape");
+    auto xshape_dims =
+        scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(x_dims);
+    attrs["inplace"] = false;
+
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {dout_name}}, {"Shape", {}}},
+        {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -167,3 +277,8 @@ REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
                   ops::FlattenOpInferShape,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
+
+REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
+                  ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker);
+REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
+                  ops::Flatten2GradInferShape);
diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc
index 3a34aa86b6331e4fe2813eea97cb6644323807c3..916f84cb4a78c3721cb67bd3cf8d3759a8eaf1bf 100644
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -13,16 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fusion_gru_op.h"
+#include <cstring>  // for memcpy
 #include <string>
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
-#include "paddle/fluid/operators/math/detail/gru_kernel.h"
+#include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
-#include "paddle/fluid/operators/math/gru_compute.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
+#include "paddle/fluid/platform/cpu_info.h"
 
 namespace paddle {
 namespace operators {
@@ -33,14 +30,7 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
                  "Input(WeightX) of GRU should not be null.");
   PADDLE_ENFORCE(ctx->HasInput("WeightH"),
                  "Input(WeightH) of GRU should not be null.");
-
   PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
-                 "Output(BatchedGate) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
-                 "Output(BatchResetHiddenPrev) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
-                 "Output(BatchedHidden) of GRU should not be null.");
   PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                  "Output(Hidden) of GRU should not be null.");
 
@@ -83,12 +73,21 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
   }
   framework::DDim out_dims({x_dims[0], frame_size});
   ctx->SetOutputDim("Hidden", out_dims);
-  ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
-  ctx->SetOutputDim("BatchedHidden", out_dims);
-  ctx->SetOutputDim("BatchResetHiddenPrev", out_dims);
   ctx->ShareLoD("X", "Hidden");
-
-  int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+  int xx_width;
+  if (ctx->Attrs().Get<bool>("use_seq")) {
+    xx_width = wx_dims[1];
+  } else {
+    xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
+                   "Output(ReorderedH0) of GRU should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
+                   "Output(BatchedInput) of GRU should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
+                   "Output(BatchedOut) of GRU should not be null.");
+    ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
+    ctx->SetOutputDim("BatchedOut", out_dims);
+  }
   ctx->SetOutputDim("XX", {x_dims[0], xx_width});
   ctx->ShareLoD("X", "XX");
 }
@@ -115,22 +114,29 @@ void FusionGRUOpMaker::Make() {
            "(Tensor) The FC weight with shape (M x 3D),"
            "where M is the dim size of x, D is the hidden size. ");
   AddInput("WeightH",
-           "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. ");
+           "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "
+           "This weight is not exactly D x 3D as: {W_update, W_reset, W_state}"
+           "Acutally they are D x 2D and D x D two part weights."
+           "{W_update, W_reset; W_state}"
+           "{D x (D + D); D x D}");
   AddInput("Bias",
            "(Tensor, optional) (1 x 3D)."
            "Almost same as GRUOp."
            "Note: if have FC bias it should be added on this bias.")
       .AsDispensable();
+  AddOutput("ReorderedH0", "(Tensor) (N x D), which N is the min-batch size.")
+      .AsIntermediate();
   AddOutput("XX",
-            "(LoDTensor) the result after X * WeightX (size is T x 4D)"
+            "(LoDTensor) the result after X * WeightX (size is T x 3D)"
             " or batched_X (size is T x M), this will be automatically chosen,"
             " where T is the total time steps in this mini-batch,"
             " D is the hidden size, M is the dim size of x input.")
       .AsIntermediate();
-  AddOutput("BatchedGate", "(LoDTensor) Same as GRUOp").AsIntermediate();
-  AddOutput("BatchResetHiddenPrev", "(LoDTensor) (T x 3D) Same as GRUOp.")
+  AddOutput("BatchedInput",
+            "(LoDTensor) This is the batched result of input X"
+            "or the batched result after fc, shape (T x 3D)")
       .AsIntermediate();
-  AddOutput("BatchedHidden", "(LoDTensor) (T X D) Same as GRUOp.")
+  AddOutput("BatchedOut", "(LoDTensor) (T X D) save batched hidden.")
       .AsIntermediate();
   AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp");
   AddAttr<std::string>("activation",
@@ -146,6 +152,10 @@ void FusionGRUOpMaker::Make() {
                 "(bool, defalut: False) "
                 "whether to compute reversed GRU.")
       .SetDefault(false);
+  AddAttr<bool>("use_seq",
+                "(bool, defalut: True) "
+                "whether to use seq mode to compute GRU.")
+      .SetDefault(true);
   AddComment(R"DOC(
 The Fusion complete GRU Operator.
 This operator fuse the fully-connected operator into GRU, 
@@ -153,172 +163,261 @@ more details can refer to GRU op.
 )DOC");
 }
 
-template <typename DeviceContext, typename T>
-inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src,
-                             framework::Vector<size_t> index_lod,
-                             framework::Tensor* dst, bool indexed_src) {
-  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
-  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index_lod, dst, indexed_src);
-}
-
-template <typename DeviceContext, typename T>
+template <typename T>
 class FusionGRUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    if (ctx.Attr<bool>("use_seq")) {
+      SeqCompute(ctx);
+    } else {
+      BatchCompute(ctx);
+    }
+  }
+
+#define INIT_VEC_FUNC                                                     \
+  std::function<void(const int, const T *, T *)> act_gate, act_state;     \
+  std::function<void(const int, const T*, const T*, const T*, T*)> cross; \
+  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");          \
+  auto& act_state_str = ctx.Attr<std::string>("activation");              \
+  if (platform::jit::MayIUse(platform::jit::avx)) {                       \
+    math::VecActivations<T, platform::jit::avx> act_functor;              \
+    act_gate = act_functor(act_gate_str);                                 \
+    act_state = act_functor(act_state_str);                               \
+    cross = math::vec_cross<T, platform::jit::avx>;                       \
+  } else {                                                                \
+    math::VecActivations<T, platform::jit::isa_any> act_functor;          \
+    act_gate = act_functor(act_gate_str);                                 \
+    act_state = act_functor(act_state_str);                               \
+    cross = math::vec_cross<T, platform::jit::isa_any>;                   \
+  }
+
+#define INIT_BASE_INPUT_OUTPUT                        \
+  auto* h0 = ctx.Input<Tensor>("H0");                 \
+  auto* wx = ctx.Input<Tensor>("WeightX");            \
+  auto* wh = ctx.Input<Tensor>("WeightH");            \
+  auto* bias = ctx.Input<Tensor>("Bias");             \
+  auto* xx = ctx.Output<LoDTensor>("XX");             \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");
+
+#define INIT_BASE_SIZES                  \
+  auto x_dims = x->dims();   /* T x M*/  \
+  auto wh_dims = wh->dims(); /* D x 3D*/ \
+  const int total_T = x_dims[0];         \
+  const int M = x_dims[1];               \
+  const int D = wh_dims[0];              \
+  const int D3 = wh_dims[1];             \
+  const int D2 = D * 2;
+
+  void SeqCompute(const framework::ExecutionContext& ctx) const {
+    using DeviceContext = paddle::platform::CPUDeviceContext;
     auto* x = ctx.Input<LoDTensor>("X");
-    auto* wx = ctx.Input<Tensor>("WeightX");
-    auto* wh = ctx.Input<Tensor>("WeightH");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto* h0 = ctx.Input<Tensor>("H0");
-
-    auto* xx = ctx.Output<LoDTensor>("XX");
-    auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
-    auto* batch_reset_hidden_prev =
-        ctx.Output<LoDTensor>("BatchResetHiddenPrev");
-    auto* batch_hidden = ctx.Output<LoDTensor>("BatchedHidden");
-    auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
-    bool is_reverse = ctx.Attr<bool>("is_reverse");
+    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
+    INIT_VEC_FUNC
 
+    auto x_lod = x->lod();
+    const int N = x_lod[0].size() - 1;
+    const T* x_data = x->data<T>();
+    const T* h0_data = h0 ? h0->data<T>() : nullptr;
+    const T* wx_data = wx->data<T>();
+    const T* wh_data = wh->data<T>();
+    const T* wh_state_data = wh_data + D * D2;
     T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
-    T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
-    batch_reset_hidden_prev->mutable_data<T>(ctx.GetPlace());
-    batch_hidden->mutable_data<T>(ctx.GetPlace());
-    hidden_out->mutable_data<T>(ctx.GetPlace());
+    T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
+
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, x_data, wx_data,
+                                      xx_data,
+                                      bias ? bias->data<T>() : nullptr);
+
+    int xx_offset = D3;
+    int gate_offset = D;
+    if (is_reverse) {
+      const int offset = (total_T - 1) * D;
+      xx_data = xx_data + offset * 3;
+      hidden_out_data = hidden_out_data + offset;
+      xx_offset = -D3;
+      gate_offset = -D;
+    }
+    auto move_step = [&]() {
+      xx_data = xx_data + xx_offset;
+      hidden_out_data = hidden_out_data + gate_offset;
+    };
+    for (int i = 0; i < N; ++i) {
+      int bid = is_reverse ? N - 1 - i : i;
+      int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
+      const T* prev_hidden_data = nullptr;
+      int tstart = 0;
+      if (h0_data) {
+        prev_hidden_data = h0_data + bid * D;
+      } else {
+        // W: {W_update, W_reset; W_state}
+        // update gate
+        act_gate(D, xx_data, xx_data);
+        // state gate
+        act_state(D, xx_data + D2, xx_data + D2);
+        // out = a*b
+        blas.VMUL(D, xx_data, xx_data + D2, hidden_out_data);
+        // save prev
+        prev_hidden_data = hidden_out_data;
+        tstart = 1;
+        move_step();
+      }
+      for (int step = tstart; step < seq_len; ++step) {
+        // gemm prev * (Wu + Wr)
+        blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast<T>(1),
+                  prev_hidden_data, D, wh_data, D2, static_cast<T>(1), xx_data,
+                  D3);
+        act_gate(D2, xx_data, xx_data);
+        // rt = rt*ht_1 inplace result
+        blas.VMUL(D, prev_hidden_data, xx_data + D, hidden_out_data);
+
+        // gemm rt * Ws
+        blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast<T>(1),
+                  hidden_out_data, D, wh_state_data, D, static_cast<T>(1),
+                  xx_data + D2, D3);
+        act_state(D, xx_data + D2, xx_data + D2);
+        // out = zt*ht~ + (1-zt)*ht_1
+        cross(D, xx_data, xx_data + D2, prev_hidden_data, hidden_out_data);
+        // save prev
+        prev_hidden_data = hidden_out_data;
+        move_step();
+      }
+    }
+  }
+
+  void BatchCompute(const framework::ExecutionContext& ctx) const {
+    using DeviceContext = paddle::platform::CPUDeviceContext;
+    auto* x = ctx.Input<LoDTensor>("X");
+    if (x->lod()[0].size() == 2) {
+      SeqCompute(ctx);
+      return;
+    }
+    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
+    INIT_VEC_FUNC
+
+    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
+    auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
+    auto* batched_out = ctx.Output<LoDTensor>("BatchedOut");
 
     const T* x_data = x->data<T>();
     const T* wx_data = wx->data<T>();
     const T* wh_data = wh->data<T>();
-    auto x_dims = x->dims();
-    auto wx_dims = wx->dims();
+    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
+    T* batched_input_data = batched_input->mutable_data<T>(ctx.GetPlace());
+    T* batched_out_data = batched_out->mutable_data<T>(ctx.GetPlace());
+    hidden_out->mutable_data<T>(ctx.GetPlace());
+
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    if (x_dims[1] > wx_dims[1]) {
-      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
-                                        x_data, wx_data, xx_data,
-                                        bias ? bias->data<T>() : NULL);
-      to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
+    if (M > D3) {
+      math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, x_data, wx_data,
+                                        xx_data,
+                                        bias ? bias->data<T>() : nullptr);
+      to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
     } else {
       to_batch(dev_ctx, *x, xx, true, is_reverse);
-      batched_gate->set_lod(xx->lod());
-      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
-                                        xx_data, wx_data, batched_gate_data,
-                                        bias ? bias->data<T>() : NULL);
+      batched_input->set_lod(xx->lod());
+      math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, xx_data, wx_data,
+                                        batched_input_data,
+                                        bias ? bias->data<T>() : nullptr);
     }
 
-    int frame_size = static_cast<int>(wx_dims[1] / 3);
-    math::GRUMetaValue<T> gru_value;
-    gru_value.gate_weight = const_cast<T*>(wh_data);
-    gru_value.state_weight =
-        const_cast<T*>(wh_data + 2 * frame_size * frame_size);
-    Tensor ordered_h0;
-
-    framework::Vector<size_t> order(batched_gate->lod()[2]);
+    auto batched_lod = batched_input->lod();
+    const auto& seq_order = batched_lod[2];
+    const int max_bs = seq_order.size();
+    reordered_h0->Resize({max_bs, D});
 
+    int tstart = 0;
+    T* prev_hidden_data = nullptr;
     if (h0) {
-      ReorderInitState<DeviceContext, T>(
-          ctx.template device_context<DeviceContext>(), *h0, order, &ordered_h0,
-          true);
-      gru_value.prev_out_value = ordered_h0.data<T>();
+      // reorder h0
+      T* reordered_h0_data = reordered_h0->mutable_data<T>(ctx.GetPlace());
+      const T* h0_data = h0->data<T>();
+      prev_hidden_data = reordered_h0_data;
+      size_t sz = sizeof(T) * D;
+      for (int i = 0; i < max_bs; ++i) {
+        std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
+        reordered_h0_data += D;
+      }
     } else {
-      gru_value.prev_out_value = nullptr;
+      // compute without h0
+      T* cur_in_data = batched_input_data;
+      T* cur_out_data = batched_out_data;
+      // W: {W_update, W_reset; W_state}
+      for (int i = 0; i < max_bs; ++i) {
+        // update gate
+        act_gate(D, cur_in_data, cur_in_data);
+        // state gate
+        act_state(D, cur_in_data + D2, cur_in_data + D2);
+        // out = a*b
+        blas.VMUL(D, cur_in_data, cur_in_data + D2, cur_out_data);
+        // add offset
+        cur_in_data += D3;
+        cur_out_data += D;
+      }
+      tstart = 1;
+      prev_hidden_data = batched_out_data;
     }
-    auto batch_starts = batched_gate->lod()[0];
-    size_t seq_len = batch_starts.size() - 1;
-    auto active_node =
-        math::detail::GetActivationType(ctx.Attr<std::string>("activation"));
-    auto active_gate = math::detail::GetActivationType(
-        ctx.Attr<std::string>("gate_activation"));
-
-#ifdef PADDLE_WITH_MKLML
-    // use MKL packed to speedup GEMM
-    if (FLAGS_paddle_num_threads >= 4) {
-      auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-      T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
-                                       frame_size * 2 /*width of weight*/,
-                                       frame_size /*height of height*/);
-      PADDLE_ENFORCE(packed_gate);
-      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2,
-                     frame_size, T(1.0), gru_value.gate_weight, frame_size * 2,
-                     packed_gate);
-      T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
-                                        frame_size /*width of weight*/,
-                                        frame_size /*height of height*/);
-      PADDLE_ENFORCE(packed_state);
-      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size,
-                     frame_size, T(1.0), gru_value.state_weight, frame_size,
-                     packed_state);
-      for (size_t n = 0; n < seq_len; n++) {
-        int bstart = static_cast<int>(batch_starts[n]);
-        int bend = static_cast<int>(batch_starts[n + 1]);
-        int cur_batch_size = bend - bstart;
-
-        Tensor gate_t = batched_gate->Slice(bstart, bend);
-        Tensor reset_hidden_prev_t =
-            batch_reset_hidden_prev->Slice(bstart, bend);
-        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-        gru_value.output_value = hidden_t.data<T>();
-        gru_value.gate_value = gate_t.data<T>();
-        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-
-        if (gru_value.prev_out_value) {
-          blas.GEMM_COMPUTE(
-              CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2,
-              frame_size, gru_value.prev_out_value, frame_size, packed_gate,
-              frame_size * 2, T(1), gru_value.gate_value, frame_size * 3);
-        }
-
-        math::detail::forward_reset_output(
-            math::detail::forward::gru_resetOutput<T>(), gru_value, frame_size,
-            cur_batch_size, active_gate);
-
-        if (gru_value.prev_out_value) {
-          blas.GEMM_COMPUTE(
-              CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size,
-              gru_value.reset_output_value, frame_size, packed_state,
-              frame_size, T(1), gru_value.gate_value + frame_size * 2,
-              frame_size * 3);
-        }
-
-        math::detail::forward_final_output(
-            math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
-            cur_batch_size, active_node);
-
-        gru_value.prev_out_value = gru_value.output_value;
+    // Then start from next
+    const T* wh_state_data = wh_data + D * D2;
+    const auto& batch_starts = batched_lod[0];
+    const int max_seq_len = batch_starts.size() - 1;
+    batched_input_data = batched_input_data + tstart * max_bs * D3;
+    batched_out_data = batched_out_data + tstart * max_bs * D;
+    for (int step = tstart; step < max_seq_len; ++step) {
+      const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+      // gemm prev * (Wu + Wr)
+      blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D2, D, static_cast<T>(1),
+                prev_hidden_data, D, wh_data, D2, static_cast<T>(1),
+                batched_input_data, D3);
+
+      T* cur_batched_data = batched_input_data;
+      T* cur_out_data = batched_out_data;
+      T* cur_prev_hidden_data = prev_hidden_data;
+      for (int i = 0; i < cur_bs; ++i) {
+        act_gate(D2, cur_batched_data, cur_batched_data);
+        // rt = rt*ht_1 inplace result
+        blas.VMUL(D, cur_prev_hidden_data, cur_batched_data + D, cur_out_data);
+
+        cur_batched_data += D3;
+        cur_prev_hidden_data += D;
+        cur_out_data += D;
       }
 
-      blas.GEMM_FREE(packed_gate);
-      blas.GEMM_FREE(packed_state);
-    } else {
-#endif
-      for (size_t n = 0; n < seq_len; n++) {
-        int bstart = static_cast<int>(batch_starts[n]);
-        int bend = static_cast<int>(batch_starts[n + 1]);
-        int cur_batch_size = bend - bstart;
-
-        Tensor gate_t = batched_gate->Slice(bstart, bend);
-        Tensor reset_hidden_prev_t =
-            batch_reset_hidden_prev->Slice(bstart, bend);
-        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-        gru_value.output_value = hidden_t.data<T>();
-        gru_value.gate_value = gate_t.data<T>();
-        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-
-        math::GRUUnitFunctor<DeviceContext, T>::compute(
-            dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-            active_gate);
-
-        gru_value.prev_out_value = gru_value.output_value;
+      cur_batched_data = batched_input_data;
+      cur_out_data = batched_out_data;
+      blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D, D, static_cast<T>(1),
+                cur_out_data, D, wh_state_data, D, static_cast<T>(1),
+                cur_batched_data + D2, D3);
+
+      cur_prev_hidden_data = prev_hidden_data;
+      for (int i = 0; i < cur_bs; ++i) {
+        // ht~ = act_state(...)
+        act_state(D, cur_batched_data + D2, cur_batched_data + D2);
+        // out = zt*ht~ + (1-zt)*ht_1
+        cross(D, cur_batched_data, cur_batched_data + D2, cur_prev_hidden_data,
+              cur_out_data);
+
+        cur_batched_data += D3;
+        cur_prev_hidden_data += D;
+        cur_out_data += D;
       }
-#ifdef PADDLE_WITH_MKLML
+      prev_hidden_data = batched_out_data;
+      batched_out_data = cur_out_data;
+      batched_input_data = cur_batched_data;
     }
-#endif
+
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batch_hidden->set_lod(batched_gate->lod());
-    to_seq(dev_ctx, *batch_hidden, hidden_out);
+    batched_out->set_lod(batched_lod);
+    to_seq(dev_ctx, *batched_out, hidden_out);
   }
+#undef INIT_VEC_FUNC
+#undef INIT_BASE_SIZES
+#undef INIT_BASE_INPUT_OUTPUT
 };
 
 }  // namespace operators
@@ -327,6 +426,5 @@ class FusionGRUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OP_CPU_KERNEL(
-    fusion_gru, ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(fusion_gru, ops::FusionGRUKernel<float>,
+                       ops::FusionGRUKernel<double>);
diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc
index e4e4ac8e333ba423e151dea05e40a0e41042570e..ef23ab3f981786d33567619ad0194d21f31bdc8e 100644
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -16,14 +16,10 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
-#include "paddle/fluid/operators/math/lstm_compute.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
-DEFINE_bool(seq_mode, true, "Use sequence mode");
-
 namespace paddle {
 namespace operators {
 
@@ -42,10 +38,6 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                  "Output(Hidden) of LSTM should not be null.");
   PADDLE_ENFORCE(ctx->HasOutput("Cell"),
                  "Output(Cell) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
-                 "Output(BatchedGate) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
-                 "Output(BatchedGate) of LSTM should not be null.");
 
   auto x_dims = ctx->GetInputDim("X");
   PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
@@ -86,27 +78,36 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
   PADDLE_ENFORCE_EQ(b_dims[0], 1,
                     "The first dimension of Input(Bias) should be 1.");
-
-  PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_peepholes"),
-                 "Do not support peephole yet.");
-  PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
-                    "The second dimension of Input(Bias) should be "
-                    "4 * %d if disable peepholes connection",
-                    frame_size);
+  PADDLE_ENFORCE_EQ(
+      b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
+      "The second dimension of Input(Bias) should be "
+      "7 * %d if enable peepholes connection or"
+      "4 * %d if disable peepholes",
+      frame_size, frame_size);
 
   framework::DDim out_dims({x_dims[0], frame_size});
   ctx->SetOutputDim("Hidden", out_dims);
   ctx->SetOutputDim("Cell", out_dims);
-  ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
-  ctx->SetOutputDim("BatchCellPreAct", out_dims);
   ctx->ShareLoD("X", "Hidden");
   ctx->ShareLoD("X", "Cell");
-
   int xx_width;
-  if (FLAGS_seq_mode) {
+  if (ctx->Attrs().Get<bool>("use_seq")) {
     xx_width = wx_dims[1];
   } else {
     xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
+                   "Output(BatchedInput) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
+                   "Output(BatchedHidden) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
+                   "Output(BatchedCell) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
+                   "Output(ReorderedH0) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
+                   "Output(ReorderedC0) of LSTM should not be null.");
+    ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
+    ctx->SetOutputDim("BatchedHidden", out_dims);
+    ctx->SetOutputDim("BatchedCell", out_dims);
   }
   ctx->SetOutputDim("XX", {x_dims[0], xx_width});
   ctx->ShareLoD("X", "XX");
@@ -169,9 +170,11 @@ void FusionLSTMOpMaker::Make() {
             " where T is the total time steps in this mini-batch,"
             " D is the hidden size, M is the dim size of x input.")
       .AsIntermediate();
-  AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate();
-  AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).")
-      .AsIntermediate();
+  AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate();
+  AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate();
+  AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
+  AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
+  AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
   AddAttr<bool>("use_peepholes",
                 "(bool, defalut: True) "
                 "whether to enable diagonal/peephole connections.")
@@ -180,6 +183,10 @@ void FusionLSTMOpMaker::Make() {
                 "(bool, defalut: False) "
                 "whether to compute reversed LSTM.")
       .SetDefault(false);
+  AddAttr<bool>("use_seq",
+                "(bool, defalut: True) "
+                "whether to use seq mode to compute.")
+      .SetDefault(true);
   AddAttr<std::string>("gate_activation",
                        "(string, default: sigmoid)"
                        "The activation for input gate, forget gate and output "
@@ -203,286 +210,374 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
 )DOC");
 }
 
-template <typename DeviceContext, typename T>
-inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src,
-                             framework::Vector<size_t> index_lod,
-                             framework::Tensor* dst, bool indexed_src) {
-  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
-  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  // TODO(TJ): check mem copy perf
-  row_shuffle(ctx, src, index_lod, dst, indexed_src);
-}
-
 template <typename T>
 class FuisonLSTMKernel : public framework::OpKernel<T> {
  public:
+#define INIT_VEC_FUNC                                                          \
+  std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
+  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");               \
+  auto& act_cell_str = ctx.Attr<std::string>("cell_activation");               \
+  auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");          \
+  if (platform::jit::MayIUse(platform::jit::avx)) {                            \
+    math::VecActivations<T, platform::jit::avx> act_functor;                   \
+    act_gate = act_functor(act_gate_str);                                      \
+    act_cell = act_functor(act_cell_str);                                      \
+    act_cand = act_functor(act_cand_str);                                      \
+  } else {                                                                     \
+    math::VecActivations<T, platform::jit::isa_any> act_functor;               \
+    act_gate = act_functor(act_gate_str);                                      \
+    act_cell = act_functor(act_cell_str);                                      \
+    act_cand = act_functor(act_cand_str);                                      \
+  }
+
+#define INIT_BASE_INPUT_OUTPUT                        \
+  auto* x = ctx.Input<LoDTensor>("X");                \
+  auto* h0 = ctx.Input<Tensor>("H0");                 \
+  auto* c0 = ctx.Input<Tensor>("C0");                 \
+  auto* wx = ctx.Input<Tensor>("WeightX");            \
+  auto* wh = ctx.Input<Tensor>("WeightH");            \
+  auto* bias = ctx.Input<Tensor>("Bias");             \
+  auto* xx = ctx.Output<LoDTensor>("XX");             \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
+  auto* cell_out = ctx.Output<LoDTensor>("Cell");     \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");     \
+  bool use_peepholes = ctx.Attr<bool>("use_peepholes");
+
+#define INIT_BASE_SIZES                  \
+  auto x_dims = x->dims();   /* T x M*/  \
+  auto wh_dims = wh->dims(); /* D x 4D*/ \
+  const int M = x_dims[1];               \
+  const int D = wh_dims[0];              \
+  const int D2 = D * 2;                  \
+  const int D3 = D * 3;                  \
+  const int D4 = wh_dims[1];
+
+#define INIT_BASE_INPUT_DATAS                                        \
+  const T* x_data = x->data<T>();                                    \
+  const T* wx_data = wx->data<T>();                                  \
+  const T* wh_data = wh->data<T>();                                  \
+  /* diagonal weight*/                                               \
+  const T* wc_data = bias->data<T>() + D4;                           \
+  /* for peephole only*/                                             \
+  Tensor checked_cell;                                               \
+  T* checked_cell_data = nullptr;                                    \
+  auto place = ctx.GetPlace();                                       \
+  if (use_peepholes) {                                               \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                 \
+    checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \
+  }
+
+/// Compute LSTM
+#define GEMM_WH_ADDON(bs, prev, out)                                           \
+  blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
+            wh_data, D4, static_cast<T>(1), out, D4)
+
+// gates: W_ch, W_ih, W_fh, W_oh
+#define GET_Ct(ct_1, gates, ct)                   \
+  /* C_t = C_t-1 * fgated + cand_gated * igated*/ \
+  act_cand(D, gates, gates);                      \
+  blas.VMUL(D, gates, gates + D, gates + D);      \
+  blas.VMUL(D, ct_1, gates + D2, gates + D2);     \
+  blas.VADD(D, gates + D, gates + D2, ct)
+
+#define GET_Ht(ct, gates, ht)        \
+  /* H_t = act_cell(C_t) * ogated */ \
+  act_cell(D, ct, gates + D2);       \
+  blas.VMUL(D, gates + D2, gates + D3, ht)
+
+#define GET_Ct_NOH0C0(gates, ct)     \
+  /* C_t = igated * cgated*/         \
+  act_gate(D, gates + D, gates + D); \
+  act_cand(D, gates, gates);         \
+  blas.VMUL(D, gates, gates + D, ct)
+
+#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
+  GET_Ct_NOH0C0(gates, ct);                \
+  act_gate(D, gates + D3, gates + D3);     \
+  GET_Ht(ct, gates, ht)
+
+#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
+  GET_Ct_NOH0C0(gates, ct);                         \
+  /* get outgated, put W_oc * C_t on igated */      \
+  blas.VMUL(D, wc_data + D2, ct, gates + D);        \
+  blas.VADD(D, gates + D, gates + D3, gates + D3);  \
+  act_gate(D, gates + D3, gates + D3);              \
+  GET_Ht(ct, gates, ht)
+
+#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
+  act_gate(D3, gates + D, gates + D);     \
+  GET_Ct(ct_1, gates, ct);                \
+  GET_Ht(ct, gates, ht)
+
+#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht)        \
+  /* get fgated and igated*/                              \
+  blas.VMUL(D, wc_data, ct_1, checked_cell_data);         \
+  blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
+  blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
+  act_gate(D2, gates + D, gates + D);                     \
+  GET_Ct(ct_1, gates, ct);                                \
+  /* get ogated*/                                         \
+  blas.VMUL(D, wc_data + D2, ct, gates + D);              \
+  blas.VADD(D, gates + D, gates + D3, gates + D3);        \
+  act_gate(D, gates + D3, gates + D3);                    \
+  GET_Ht(ct, gates, ht)
+
   void SeqCompute(const framework::ExecutionContext& ctx) const {
     using DeviceContext = paddle::platform::CPUDeviceContext;
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* h0 = ctx.Input<Tensor>("H0");
-    auto* c0 = ctx.Input<Tensor>("C0");
-    auto* wx = ctx.Input<Tensor>("WeightX");
-    auto* wh = ctx.Input<Tensor>("WeightH");
-    auto* bias = ctx.Input<Tensor>("Bias");
-
-    auto* xx = ctx.Output<LoDTensor>("XX");
-    auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
-    auto* cell_out = ctx.Output<LoDTensor>("Cell");
-    bool is_reverse = ctx.Attr<bool>("is_reverse");
-
-    std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
-    auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
-    auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
-    auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
-    if (platform::jit::MayIUse(platform::jit::avx)) {
-      math::VecActivations<T, platform::jit::avx> act_functor;
-      act_gate = act_functor(act_gate_str);
-      act_cell = act_functor(act_cell_str);
-      act_cand = act_functor(act_cand_str);
-    } else {
-      math::VecActivations<T, platform::jit::isa_any> act_functor;
-      act_gate = act_functor(act_gate_str);
-      act_cell = act_functor(act_cell_str);
-      act_cand = act_functor(act_cand_str);
-    }
+    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
+    INIT_VEC_FUNC
+    INIT_BASE_INPUT_DATAS
 
     auto x_lod = x->lod();
-    auto x_dims = x->dims();    // T x M
-    auto wh_dims = wh->dims();  // D x 4D
     const int total_T = x_dims[0];
-    const int N = x_lod[0].size() - 1;  // batch size
-    const int M = x_dims[1];            // x frame size
-    const int D = wh_dims[0];
-    const int D2 = D * 2;
-    const int D3 = D * 3;
-    const int D4 = wh_dims[1];
-
-    const T* x_data = x->data<T>();
-    const T* h0_data = h0 ? h0->data<T>() : NULL;
-    const T* c0_data = c0 ? c0->data<T>() : NULL;
-    const T* wx_data = wx->data<T>();
-    const T* wh_data = wh->data<T>();
-    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
-    T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
-    T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
-
+    const int N = x_lod[0].size() - 1;
+    const T* h0_data = h0 ? h0->data<T>() : nullptr;
+    const T* c0_data = c0 ? c0->data<T>() : nullptr;
+    T* xx_data = xx->mutable_data<T>(place);
+    T* h_out_data = hidden_out->mutable_data<T>(place);
+    T* c_out_data = cell_out->mutable_data<T>(place);
     auto blas = math::GetBlas<DeviceContext, T>(ctx);
     math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
                                       xx_data, bias->data<T>());
+
     int xx_offset = D4;
     int gate_offset = D;
     if (is_reverse) {
       const int offset = (total_T - 1) * D;
       xx_data = xx_data + offset * 4;
-      hidden_out_data = hidden_out_data + offset;
-      cell_out_data = cell_out_data + offset;
+      h_out_data = h_out_data + offset;
+      c_out_data = c_out_data + offset;
       xx_offset = -D4;
       gate_offset = -D;
     }
 
-    auto move_step = [&]() {
-      xx_data = xx_data + xx_offset;
-      hidden_out_data = hidden_out_data + gate_offset;
-      cell_out_data = cell_out_data + gate_offset;
-    };
-
-    for (int i = 0; i < N; ++i) {
-      int bid = is_reverse ? N - 1 - i : i;
-      int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
-      const T* prev_cell_data = NULL;
-      const T* prev_hidden_data = NULL;
-      int tstart = 0;
-      if (h0_data) {
-        prev_hidden_data = h0_data + bid * D;
-        prev_cell_data = c0_data + bid * D;
-      } else {
-        // W_ch, W_ih, W_fh, W_oh
-        act_gate(D3, xx_data + D, xx_data + D);
-        act_cand(D, xx_data, xx_data);
-        // cell out= input*tilde
-        blas.VMUL(D, xx_data, xx_data + D, cell_out_data);
-        // hidden out= act_state(cellout) * outgate
-        act_cell(D, cell_out_data, xx_data + D2);
-        blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
-
-        // prev
-        prev_hidden_data = hidden_out_data;
-        prev_cell_data = cell_out_data;
-        tstart = 1;
-
-        move_step();
-      }
-      for (int step = tstart; step < seq_len; ++step) {
-        blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
-                  prev_hidden_data, D, wh_data, D4, static_cast<T>(1), xx_data,
-                  D4);
-
-        // W_ch, W_ih, W_fh, W_oh
-        act_gate(D3, xx_data + D, xx_data + D);
-        act_cand(D, xx_data, xx_data);
-
-        // a = forget * prev_cell
-        blas.VMUL(D, xx_data + D2, prev_cell_data, xx_data + D2);
-
-        // b = input * tilde
-        blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
-
-        // cell out= a+b
-        blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);
-
-        // hidden out= act_state(cellout) * outgate
-        act_cell(D, cell_out_data, xx_data + D2);
-        blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
+#define MOVE_ONE_STEP                    \
+  prev_h_data = h_out_data;              \
+  prev_c_data = c_out_data;              \
+  xx_data = xx_data + xx_offset;         \
+  h_out_data = h_out_data + gate_offset; \
+  c_out_data = c_out_data + gate_offset
+
+#define PROCESS_H0C0_DEFINES                       \
+  int bid = is_reverse ? N - 1 - i : i;            \
+  int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \
+  const T* prev_c_data = nullptr;                  \
+  const T* prev_h_data = nullptr;                  \
+  int tstart = 0
+
+#define PROCESS_H0C0_PEEPHOLE                                      \
+  PROCESS_H0C0_DEFINES;                                            \
+  if (h0_data) {                                                   \
+    prev_h_data = h0_data + bid * D;                               \
+    prev_c_data = c0_data + bid * D;                               \
+  } else {                                                         \
+    COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
+    MOVE_ONE_STEP;                                                 \
+    tstart = 1;                                                    \
+  }
 
-        // prev
-        prev_hidden_data = hidden_out_data;
-        prev_cell_data = cell_out_data;
+#define PROCESS_H0C0                                      \
+  PROCESS_H0C0_DEFINES;                                   \
+  if (h0_data) {                                          \
+    prev_h_data = h0_data + bid * D;                      \
+    prev_c_data = c0_data + bid * D;                      \
+  } else {                                                \
+    COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
+    MOVE_ONE_STEP;                                        \
+    tstart = 1;                                           \
+  }
 
-        move_step();
+    if (use_peepholes) {
+      for (int i = 0; i < N; ++i) {
+        PROCESS_H0C0_PEEPHOLE
+        for (int step = tstart; step < seq_len; ++step) {
+          GEMM_WH_ADDON(1, prev_h_data, xx_data);
+          COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data);
+          MOVE_ONE_STEP;
+        }
+      }
+    } else {
+      for (int i = 0; i < N; ++i) {
+        PROCESS_H0C0
+        for (int step = tstart; step < seq_len; ++step) {
+          GEMM_WH_ADDON(1, prev_h_data, xx_data);
+          COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data);
+          MOVE_ONE_STEP;
+        }
       }
     }
+#undef PROCESS_H0C0_DEFINES
+#undef PROCESS_H0C0_PEEPHOLE
+#undef PROCESS_H0C0
+#undef MOVE_ONE_STEP
   }
 
   void BatchCompute(const framework::ExecutionContext& ctx) const {
     using DeviceContext = platform::CPUDeviceContext;
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* wx = ctx.Input<Tensor>("WeightX");
-    auto* wh = ctx.Input<Tensor>("WeightH");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto* hidden_t0 = ctx.Input<Tensor>("H0");
-    auto* cell_t0 = ctx.Input<Tensor>("C0");
-
-    auto* xx = ctx.Output<LoDTensor>("XX");
-    auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
-    auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
-    auto* cell_out = ctx.Output<LoDTensor>("Cell");
-    bool is_reverse = ctx.Attr<bool>("is_reverse");
-
-    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
-    T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
-    hidden_out->mutable_data<T>(ctx.GetPlace());
-    cell_out->mutable_data<T>(ctx.GetPlace());
-
-    const T* x_data = x->data<T>();
-    const T* wx_data = wx->data<T>();
-    auto x_dims = x->dims();
-    auto wx_dims = wx->dims();
+    INIT_BASE_INPUT_OUTPUT
+    if (x->lod()[0].size() == 2) {
+      SeqCompute(ctx);
+      return;
+    }
+    INIT_BASE_SIZES
+    INIT_VEC_FUNC
+    INIT_BASE_INPUT_DATAS
+
+    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
+    auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
+    auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
+    auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell");
+    auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden");
+    T* xx_data = xx->mutable_data<T>(place);
+    T* batched_input_data = batched_input->mutable_data<T>(place);
+    T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
+    T* batched_h_out_data = batched_h_out->mutable_data<T>(place);
+    hidden_out->mutable_data<T>(place);
+    cell_out->mutable_data<T>(place);
 
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-    if (x_dims[1] > wx_dims[1]) {
-      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
-                                        x_data, wx_data, xx_data,
-                                        bias->data<T>());
-      to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
+    if (M > D4) {
+      math::FCCompute<DeviceContext, T>(blas, x_dims[0], D4, M, x_data, wx_data,
+                                        xx_data, bias->data<T>());
+      to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
     } else {
       to_batch(dev_ctx, *x, xx, true, is_reverse);
-      batched_gate->set_lod(xx->lod());
-      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
-                                        xx_data, wx_data, batched_gate_data,
+      batched_input->set_lod(xx->lod());
+      math::FCCompute<DeviceContext, T>(blas, x_dims[0], D4, M, xx_data,
+                                        wx_data, batched_input_data,
                                         bias->data<T>());
     }
 
-    int frame_size = static_cast<int>(wx_dims[1] / 4);
-    framework::DDim out_dims({x_dims[0], frame_size});
-    math::LstmMetaValue<T> lstm_value;
-    // no peephole
-    lstm_value.check_ig = nullptr;
-    lstm_value.check_fg = nullptr;
-    lstm_value.check_og = nullptr;
-    lstm_value.prev_state_value = nullptr;
-    Tensor ordered_c0;
-
-    framework::Vector<size_t> order(batched_gate->lod()[2]);
-
-    if (cell_t0) {
-      // Since the batch computing for LSTM reorders the input sequence
-      // according to their length. The initialized cell state also needs
-      // to reorder.
-      ReorderInitState<DeviceContext, T>(dev_ctx, *cell_t0, order, &ordered_c0,
-                                         true);
-      lstm_value.prev_state_value = ordered_c0.data<T>();
+    auto batched_lod = batched_input->lod();
+    const auto& seq_order = batched_lod[2];
+    const int max_bs = seq_order.size();
+    reordered_h0->Resize({max_bs, D});
+    reordered_c0->Resize({max_bs, D});
+
+    int tstart = 0;
+    T* prev_h_data = nullptr;
+    T* prev_c_data = nullptr;
+    if (h0) {
+      // reorder h0, c0
+      T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
+      T* reordered_c0_data = reordered_c0->mutable_data<T>(place);
+      const T* h0_data = h0->data<T>();
+      const T* c0_data = c0->data<T>();
+      prev_h_data = reordered_h0_data;
+      prev_c_data = reordered_c0_data;
+      size_t sz = sizeof(T) * D;
+      for (int i = 0; i < max_bs; ++i) {
+        std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
+        std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz);
+        reordered_h0_data += D;
+        reordered_c0_data += D;
+      }
+    } else {
+      // compute without h0, c0
+      T* cur_in_data = batched_input_data;
+      T* cur_h_out_data = batched_h_out_data;
+      T* cur_c_out_data = batched_c_out_data;
+      for (int i = 0; i < max_bs; ++i) {
+        GET_Ct_NOH0C0(cur_in_data, cur_c_out_data);
+        if (use_peepholes) {
+          blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D);
+          blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3);
+        }
+        act_gate(D, cur_in_data + D3, cur_in_data + D3);
+        GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data);
+        cur_in_data += D4;
+        cur_c_out_data += D;
+        cur_h_out_data += D;
+      }
+      tstart = 1;
+      prev_h_data = batched_h_out_data;
+      prev_c_data = batched_c_out_data;
     }
-
-    // Use the local variable as here.
-    LoDTensor batch_hidden, batch_cell;
-    auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
-    batch_hidden.mutable_data<T>(out_dims, ctx.GetPlace());
-    batch_cell.mutable_data<T>(out_dims, ctx.GetPlace());
-    batch_cell_pre_act->mutable_data<T>(out_dims, ctx.GetPlace());
-
-    auto batch_starts = batched_gate->lod()[0];
-    size_t max_seq_len = batch_starts.size() - 1;
-    auto gate_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("gate_activation"));
-    auto cell_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("cell_activation"));
-    auto cand_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("candidate_activation"));
-
-    for (size_t n = 0; n < max_seq_len; n++) {
-      int bstart = static_cast<int>(batch_starts[n]);
-      int bend = static_cast<int>(batch_starts[n + 1]);
-
-      Tensor gate_t = batched_gate->Slice(bstart, bend);
-      Tensor out_t = batch_hidden.Slice(bstart, bend);
-      Tensor cell_t = batch_cell.Slice(bstart, bend);
-      Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
-
-      int cur_batch_size = bend - bstart;
-
-      if (n > 0) {
-        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
-        int pre_h_end = pre_h_start + cur_batch_size;
-        auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
-        // TODO(TJ): use gemm directly
-        blas.MatMul(pre_hidden_t, false, *wh, false, static_cast<T>(1.0),
-                    &gate_t, static_cast<T>(1.0));
-      } else if (hidden_t0) {
-        // TODO(TJ): move h0 outside for
-        // If n == 0 and there is no initialized hidden state, that is to say
-        // the H0 is zeros, the calculation W_h * H0 will be skiped.
-        // If n == 0 and there is initialized hidden state, calculate W_h * H0.
-
-        // Since the batch computing for LSTM reorders the input sequence
-        // according to their length. The initialized hidden state also needs
-        // to reorder.
-        Tensor ordered_h0;
-        ReorderInitState<DeviceContext, T>(dev_ctx, *hidden_t0, order,
-                                           &ordered_h0, true);
-        // TODO(TJ): use gemm directly
-        blas.MatMul(ordered_h0, false, *wh, false, static_cast<T>(1.0), &gate_t,
-                    static_cast<T>(1.0));
+    const auto& batch_starts = batched_lod[0];
+    const int max_seq_len = batch_starts.size() - 1;
+    const int offset = tstart * max_bs * D;
+    batched_input_data = batched_input_data + offset * 4;
+    batched_h_out_data = batched_h_out_data + offset;
+    batched_c_out_data = batched_c_out_data + offset;
+
+#define DEFINE_CUR                        \
+  T* cur_in_data = batched_input_data;    \
+  T* cur_prev_c_data = prev_c_data;       \
+  T* cur_c_out_data = batched_c_out_data; \
+  T* cur_h_out_data = batched_h_out_data
+
+#define MOVE_ONE_BATCH  \
+  cur_in_data += D4;    \
+  cur_prev_c_data += D; \
+  cur_c_out_data += D;  \
+  cur_h_out_data += D
+
+#define MOVE_ONE_STEP                  \
+  prev_c_data = batched_c_out_data;    \
+  prev_h_data = batched_h_out_data;    \
+  batched_c_out_data = cur_c_out_data; \
+  batched_h_out_data = cur_h_out_data; \
+  batched_input_data = cur_in_data
+
+    if (use_peepholes) {
+      for (int step = tstart; step < max_seq_len; ++step) {
+        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
+        DEFINE_CUR;
+        for (int i = 0; i < cur_bs; ++i) {
+          COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data,
+                                cur_h_out_data);
+          MOVE_ONE_BATCH;
+        }
+        MOVE_ONE_STEP;
+      }
+    } else {
+      for (int step = tstart; step < max_seq_len; ++step) {
+        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
+        DEFINE_CUR;
+        for (int i = 0; i < cur_bs; ++i) {
+          COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
+                       cur_h_out_data);
+          MOVE_ONE_BATCH;
+        }
+        MOVE_ONE_STEP;
       }
-
-      lstm_value.gate_value = gate_t.data<T>();
-      lstm_value.output_value = out_t.data<T>();
-      lstm_value.state_value = cell_t.data<T>();
-      lstm_value.state_active_value = cell_pre_act_t.data<T>();
-      math::LstmUnitFunctor<DeviceContext, T>::compute(
-          dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act,
-          cand_act);
-      lstm_value.prev_state_value = lstm_value.state_value;
     }
+#undef MOVE_ONE_STEP
+#undef MOVE_ONE_BATCH
+#undef DEFINE_CUR
 
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batch_hidden.set_lod(batched_gate->lod());
-    // restore the output hidden in LoDTensor from the batch hidden
-    to_seq(dev_ctx, batch_hidden, hidden_out);
-
-    batch_cell.set_lod(batched_gate->lod());
-    // restore the output cell state in LoDTensor from the batch cell
-    to_seq(dev_ctx, batch_cell, cell_out);
+    batched_h_out->set_lod(batched_lod);
+    to_seq(dev_ctx, *batched_h_out, hidden_out);
+    batched_c_out->set_lod(batched_lod);
+    to_seq(dev_ctx, *batched_c_out, cell_out);
   }
+
   void Compute(const framework::ExecutionContext& ctx) const override {
-    if (FLAGS_seq_mode) {
+    if (ctx.Attr<bool>("use_seq")) {
       SeqCompute(ctx);
     } else {
       BatchCompute(ctx);
     }
   }
+
+#undef COMPUTE_CtHt_PEEPHOLE
+#undef COMPUTE_CtHt
+#undef GET_Ct_NOH0C0
+#undef COMPUTE_CtHt_NOH0C0
+#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
+#undef GET_Ht
+#undef GET_Ct
+#undef GEMM_WH_ADDON
+#undef INIT_BASE_INPUT_DATAS
+#undef INIT_BASE_SIZES
+#undef INIT_BASE_INPUT_OUTPUT
+#undef INIT_VEC_FUNC
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h
index 2d9faed648aef78da60706e13db3862080c96514..451ec61ba1f7239d92c6dfbad0b2961e74e1bc17 100644
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
@@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel<T> {
               gate_data, frame_size * 3);
 
     // calculate activited gate
-    Eigen::array<int, 2> extents({{batch_size, frame_size}});
-    Eigen::array<int, 2> u_offsets({{0, 0}});
+    Eigen::array<int, 2> extents{{batch_size, frame_size}};
+    Eigen::array<int, 2> u_offsets{{0, 0}};
     ActCompute(context.Attr<int>("gate_activation"), place,
                g.slice(u_offsets, extents), g.slice(u_offsets, extents));
     auto u = g.slice(u_offsets, extents);  // update gate
-    Eigen::array<int, 2> r_offsets({{0, frame_size}});
+    Eigen::array<int, 2> r_offsets{{0, frame_size}};
     ActCompute(context.Attr<int>("gate_activation"), place,
                g.slice(r_offsets, extents), g.slice(r_offsets, extents));
     auto r = g.slice(r_offsets, extents);  // reset gate
@@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
               weight_data + frame_size * frame_size * 2, frame_size, 1,
               gate_data + frame_size * 2, frame_size * 3);
 
-    Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
+    Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
     ActCompute(context.Attr<int>("activation"), place,
                g.slice(c_offsets, extents), g.slice(c_offsets, extents));
     auto c = g.slice(c_offsets, extents);  // output candidate
@@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     int batch_size = input->dims()[0];
     int frame_size = hidden_prev->dims()[1];
 
-    Eigen::array<int, 2> extents({{batch_size, frame_size}});
-    Eigen::array<int, 2> u_offsets({{0, 0}});
+    Eigen::array<int, 2> extents{{batch_size, frame_size}};
+    Eigen::array<int, 2> u_offsets{{0, 0}};
     auto u = g.slice(u_offsets, extents);  // update gate
-    Eigen::array<int, 2> r_offsets({{0, frame_size}});
+    Eigen::array<int, 2> r_offsets{{0, frame_size}};
     auto r = g.slice(r_offsets, extents);  // reset gate
-    Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
+    Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
     auto c = g.slice(c_offsets, extents);  // output candidate
 
     // backward for unactivated update gate
diff --git a/paddle/fluid/operators/label_smooth_op.h b/paddle/fluid/operators/label_smooth_op.h
index f56fd95e96526c59e040fbbd2812360e59570a08..f3da17de011053fa118b5a4257bb5c3b00084741 100644
--- a/paddle/fluid/operators/label_smooth_op.h
+++ b/paddle/fluid/operators/label_smooth_op.h
@@ -38,7 +38,8 @@ class LabelSmoothKernel : public framework::OpKernel<T> {
       auto dist = framework::EigenVector<T>::Flatten(*dist_t);
       out.device(dev) =
           static_cast<T>(1 - epsilon) * in +
-          epsilon * dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
+          static_cast<T>(epsilon) *
+              dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
     } else {
       out.device(dev) = static_cast<T>(1 - epsilon) * in +
                         static_cast<T>(epsilon / label_dim);
diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu
index 0886c41a1b582881faf24f5531d414db4e4db71c..22343d7724b2f0dc01bff8c2274e3dd914bf70ef 100644
--- a/paddle/fluid/operators/layer_norm_op.cu
+++ b/paddle/fluid/operators/layer_norm_op.cu
@@ -67,27 +67,27 @@ template <typename T, int BlockDim>
 __global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
                                  T *y, T *mean, T *var, float epsilon,
                                  int feature_size) {
-  using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
+  using BlockReduce = cub::BlockReduce<PairForLayerNorm<double>, BlockDim>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
   int beg_idx = blockIdx.x * feature_size + threadIdx.x;
   int end_idx = (blockIdx.x + 1) * feature_size;
 
   // Step 1: Reduce to calculate mean and var
-  T mean_val = static_cast<T>(0);
-  T var_val = static_cast<T>(0);
+  double mean_val = 0;
+  double var_val = 0;
   for (int i = beg_idx; i < end_idx; i += BlockDim) {
     T tmp = x[i];
     mean_val += tmp;
     var_val += (tmp * tmp);
   }
   auto pair = BlockReduce(temp_storage)
-                  .Reduce(PairForLayerNorm<T>(mean_val, var_val),
-                          PairForLayerNormAddFunctor<T>());
+                  .Reduce(PairForLayerNorm<double>(mean_val, var_val),
+                          PairForLayerNormAddFunctor<double>());
   if (threadIdx.x == 0) {
     auto tmp = pair.first_ / feature_size;
-    mean[blockIdx.x] = tmp;
-    var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp;
+    mean[blockIdx.x] = static_cast<T>(tmp);
+    var[blockIdx.x] = static_cast<T>(pair.second_ / feature_size - tmp * tmp);
   }
   __syncthreads();
   mean_val = mean[blockIdx.x];
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index 00ba5ce8ee5e4084c8af204cfc37fe80c437f0d7..b3f7e0c0097b469998049a1db65d56a28cf02b5e 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index f5c10ced8305b64c6386c5051804f8c9a8f71802..58463dc4d6fd7cc3454de766814a947fee161070 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -57,7 +57,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
           memset(output + i * row_width, 0, row_width * sizeof(T));
         } else {
           PADDLE_ENFORCE_LT(ids[i], row_number);
-          PADDLE_ENFORCE_GE(ids[i], 0);
+          PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
           memcpy(output + i * row_width, table + ids[i] * row_width,
                  row_width * sizeof(T));
         }
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 1b75df5d7d97e54dfdc461660e53a368311e3778..d7f0f3c6280db7d121bf8821ec6d578e22a33da6 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -1,4 +1,6 @@
+if (NOT WIN32)
 add_subdirectory(detail)
+endif(NOT WIN32)
 
 function(math_library TARGET)
     # math_library is a function to create math library. 
@@ -38,9 +40,13 @@ math_library(context_project DEPS im2col math_function)
 math_library(cross_entropy)
 math_library(cos_sim_functor)
 math_library(depthwise_conv)
-math_library(gru_compute DEPS activation_functions math_function)
 math_library(im2col)
+
+if (NOT WIN32) # windows do not support avx functions yet.
+math_library(gru_compute DEPS activation_functions math_function)
 math_library(lstm_compute DEPS activation_functions)
+endif (NOT WIN32)
+
 cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
 math_library(math_function DEPS blas)
 math_library(maxouting)
@@ -51,7 +57,9 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
+if (NOT WIN32)
 math_library(matrix_bit_code)
+endif (NOT WIN32)
 math_library(unpooling)
 math_library(vol2col)
 
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index 5693761e9ffd96b40040223b5498b63b0274bf0f..9560e3a3c15ca63892fbe3552679a22f027f11e2 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -132,6 +132,121 @@ inline void vec_scal<float, platform::jit::avx512_common>(const int n,
   vec_scal<float, platform::jit::avx2>(n, a, x, y);
 }
 
+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = a - x[i];
+  }
+}
+
+template <>
+inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
+                                                    const float* x, float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(a);
+  __m256 tmp;
+#define MOVE_ONE_STEP             \
+  tmp = _mm256_loadu_ps(x + i);   \
+  tmp = _mm256_sub_ps(bias, tmp); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    y[i] = a - x[i];
+  }
+#else
+  vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
+#endif
+}
+
+template <>
+inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
+                                                     const float* x, float* y) {
+  vec_bias_sub<float, platform::jit::avx>(n, a, x, y);
+}
+
+template <>
+inline void vec_bias_sub<float, platform::jit::avx512_common>(const int n,
+                                                              const float a,
+                                                              const float* x,
+                                                              float* y) {
+  // TODO(TJ): enable me
+  vec_bias_sub<float, platform::jit::avx2>(n, a, x, y);
+}
+
+// out = x*y + (1-x)*z
+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
+  for (int i = 0; i < n; ++i) {
+    out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
+  }
+}
+
+template <>
+inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
+                                                 const float* y, const float* z,
+                                                 float* out) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(1.f);
+  __m256 tmpx, tmpy, tmpz;
+  for (i = 0; i < end; i += block) {
+    tmpx = _mm256_loadu_ps(x + i);
+    tmpy = _mm256_loadu_ps(y + i);
+    tmpz = _mm256_loadu_ps(z + i);
+    tmpy = _mm256_mul_ps(tmpx, tmpy);
+    tmpx = _mm256_sub_ps(bias, tmpx);
+    tmpz = _mm256_mul_ps(tmpx, tmpz);
+    tmpz = _mm256_add_ps(tmpy, tmpz);
+    _mm256_storeu_ps(out + i, tmpz);
+  }
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
+  }
+#else
+  vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
+#endif
+}
+
+template <>
+inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
+                                                  const float* y,
+                                                  const float* z, float* out) {
+  vec_cross<float, platform::jit::avx>(n, x, y, z, out);
+}
+
+template <>
+inline void vec_cross<float, platform::jit::avx512_common>(
+    const int n, const float* x, const float* y, const float* z, float* out) {
+  // TODO(TJ): enable me
+  vec_cross<float, platform::jit::avx>(n, x, y, z, out);
+}
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 9a6e646b28fdec78734eb4e7b98c8acf688b2645..5923792902a81521256de300f77955f1ea3d16c6 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -55,7 +55,7 @@ struct TensorSetConstantCPU {
   TensorSetConstantCPU(framework::Tensor* tensor, float value)
       : tensor_(tensor), value_(value) {}
   template <typename T>
-  void operator()() const {
+  void apply() const {
     auto cpu = platform::CPUPlace();
     auto* begin = tensor_->mutable_data<T>(cpu);
     std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index 12d1baa8fb544a8b9684e43204c61ba410d1b295..79b7538ad05b0ff348b8264d50b63211b5254e80 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -52,7 +52,7 @@ struct TensorSetConstantGPU {
       : context_(context), tensor_(tensor), value_(value) {}
 
   template <typename T>
-  void operator()() const {
+  void apply() const {
     SetConstant<platform::CUDADeviceContext, T> functor;
     functor(reinterpret_cast<const platform::CUDADeviceContext&>(context_),
             tensor_, static_cast<T>(value_));
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index 5454d58f371afb5f5d6a1c3208318f80d4e0aa36..07854c83584f90db02b416b85a4aa61f5cdc0685 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -17,6 +17,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 
+#if defined(_WIN32)
+#include <intrin.h>
+#include <windows.h>
+#endif  // _WIN32
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -55,12 +60,38 @@ namespace math {
  *    FindLastSet(x) = 1 + \floor*{\log_{2}x}
  * \f]
  */
+#if !defined(_WIN32)
 inline constexpr size_t FindLastSet(size_t x) {
   return std::is_same<size_t, unsigned int>::value
              ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
              : (std::is_same<size_t, unsigned long>::value  // NOLINT
                     ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
                     : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
+
+#else
+// windows don't have built-in clz, ctz function
+template <typename T>
+inline int ctz(const T& value) {
+  DWORD trailing_zero = 0;
+  if (_BitScanForward(&trailing_zero, value)) {
+    return static_cast<int>(trailing_zero);
+  } else {
+    return static_cast<int>(0);
+  }
+}
+
+template <typename T>
+inline int clz(const T& value) {
+  DWORD leadning_zero = 0;
+  if (_BitScanReverse(&leadning_zero, value)) {
+    return static_cast<int>(sizeof(T) * 8 - leadning_zero);
+  } else {
+    return static_cast<int>(0);
+  }
+}
+
+inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
+#endif  // !_WIN32
 }
 
 struct SimpleCode {
diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h
index 4166fb54946b7082f5f7dc0e232f636a1d2f8a13..e4d378dc23210e95605c6e09eda8a190cc5c6b4f 100644
--- a/paddle/fluid/operators/math/maxouting.h
+++ b/paddle/fluid/operators/math/maxouting.h
@@ -16,13 +16,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace operators {
 namespace math {
 
-#define FLT_MAX __FLT_MAX__
-
 template <typename DeviceContext, typename T>
 class MaxOutFunctor {
  public:
diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h
index 2538d739cce95d1b2fc5b3f905af5e6d94cf7af5..120f5919803806e0d3b7dc8eaf530ae89819b84d 100644
--- a/paddle/fluid/operators/math/pooling.h
+++ b/paddle/fluid/operators/math/pooling.h
@@ -18,15 +18,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace operators {
 namespace math {
 
-#define FLT_MAX \
-  __FLT_MAX__  // TODO(zcd) :It might need to be placed in another file, but I'm
-               // still wondering where to put it.
-
 /*
  * \brief Extracting simple operations from pooling.
  *        Both MaxPool and AvgPool need "initial", "compute" and "finalize"
diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h
index 07372235a7c23832e528c3e852a4747f4244b833..a3186f82d0c0cc6c9585735ddf7e9bb4db7126cb 100644
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
@@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor {
     // Calculate the start position of each batch.
     // example:  sequences = {s0, s1, s2}
     //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-    //           num_batch = 5,
+    //           max_seqlen = 5,
     //           batchIndex = {b0, b1, b2, b3, b4}
     //           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
     //           batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
@@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor {
     //               where 1 is the second sequence,
     //                     0 is the first sequence,
     //                     2 is the third sequence.
-    // The num_batch represents batch size after rearranging the
+    // The max_seqlen represents batch size after rearranging the
     // input LodTensor. It is also the maximum length of input sequence.
 
     paddle::framework::LoD batch_lods;
@@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor {
     batch_lods.emplace_back(std::vector<size_t>{0});
 
     // batch_lods[0] is the start positions for batch LoDTensor
-    int num_batch = seq_info[0].length;
-    batch_lods[0].resize(static_cast<size_t>(num_batch + 1));
+    int max_seqlen = seq_info[0].length;
+    batch_lods[0].resize(static_cast<size_t>(max_seqlen + 1));
     // batch_lods[1] is the raw index in the input LoDTensor
     batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
     // batch_lods[2] is the sort order for the input LoDTensor.
@@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor {
     size_t* batch_starts = batch_lods[0].data();
     size_t* seq2batch_idx = batch_lods[1].data();
     batch_starts[0] = 0;
-    for (int n = 0; n < num_batch; n++) {
+    for (int n = 0; n < max_seqlen; n++) {
       auto batch_id = static_cast<int>(batch_starts[n]);
       for (size_t i = 0; i < seq_info.size(); ++i) {
         int seq_len = seq_info[i].length;
diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu
index 625065692c1f32c89d9e566d00051e237ac9a3af..59d8b9b8a8d554eb16826712ff634eed5df2d648 100644
--- a/paddle/fluid/operators/one_hot_op.cu
+++ b/paddle/fluid/operators/one_hot_op.cu
@@ -41,7 +41,7 @@ struct OneHotOpCUDAFunctor {
       : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
 
   template <typename OutT>
-  void operator()() const {
+  void apply() const {
     auto* p_in_data = in_->data<InT>();
     auto numel = in_->numel();
     auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h
index 7e77f25089c4bd0297b0eb5a0ed7555cc0af5a9f..1ebd2676496940ff8f90caaaded5c8227bd7ae78 100644
--- a/paddle/fluid/operators/one_hot_op.h
+++ b/paddle/fluid/operators/one_hot_op.h
@@ -31,7 +31,7 @@ struct OneHotOpFunctor {
       : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
 
   template <typename OutT>
-  void operator()() const {
+  void apply() const {
     auto* p_in_data = in_->data<InT>();
     auto numel = in_->numel();
     auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a706d05fd7c35ef993f5199f0f893622cb863c5d
--- /dev/null
+++ b/paddle/fluid/operators/pad2d_op.cc
@@ -0,0 +1,584 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T>
+void Pad2DConstNCHW(const T* in_data, const int num, const int channels,
+                    const int in_height, const int in_width,
+                    const int out_height, const int out_width,
+                    const int pad_top, const int pad_left, T value,
+                    T* out_data) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int out_h = 0; out_h < out_height; ++out_h) {
+        for (int out_w = 0; out_w < out_width; ++out_w) {
+          int in_h = out_h - pad_top;
+          int in_w = out_w - pad_left;
+          out_data[out_h * out_width + out_w] =
+              (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)
+                  ? value
+                  : in_data[in_h * in_width + in_w];
+        }
+      }
+      in_data += in_height * in_width;
+      out_data += out_height * out_width;
+    }
+  }
+}
+
+template <typename T>
+void Pad2DConstNHWC(const T* in_data, const int num, const int channels,
+                    const int in_height, const int in_width,
+                    const int out_height, const int out_width,
+                    const int pad_top, const int pad_left, T value,
+                    T* out_data) {
+  for (int n = 0; n < num; ++n) {
+    for (int out_h = 0; out_h < out_height; ++out_h) {
+      for (int out_w = 0; out_w < out_width; ++out_w) {
+        int in_h = out_h - pad_top;
+        int in_w = out_w - pad_left;
+        const int out_index = (out_h * out_width + out_w) * channels;
+        if (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) {
+          for (int c = 0; c < channels; ++c) {
+            out_data[out_index + c] = value;
+          }
+        } else {
+          const int in_index = (in_h * in_width + in_w) * channels;
+          for (int c = 0; c < channels; ++c) {
+            out_data[out_index + c] = in_data[in_index + c];
+          }
+        }
+      }
+    }
+    in_data += in_height * in_width * channels;
+    out_data += out_height * out_width * channels;
+  }
+}
+
+template <typename T>
+void Pad2DReflectNCHW(const T* in_data, const int num, const int channels,
+                      const int in_height, const int in_width,
+                      const int out_height, const int out_width,
+                      const int pad_top, const int pad_left, T* out_data) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int out_h = 0; out_h < out_height; ++out_h) {
+        for (int out_w = 0; out_w < out_width; ++out_w) {
+          int in_h = out_h - pad_top;
+          int in_w = out_w - pad_left;
+          in_h = std::max(in_h, -in_h);  // reflect by 0
+          in_h =
+              std::min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
+          in_w = std::max(in_w, -in_w);                  // reflect by 0
+          in_w =
+              std::min(in_w, 2 * in_width - in_w - 2);  // reflect by in_width
+          out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w];
+        }
+      }
+      in_data += in_height * in_width;
+      out_data += out_height * out_width;
+    }
+  }
+}
+
+template <typename T>
+void Pad2DReflectNHWC(const T* in_data, const int num, const int channels,
+                      const int in_height, const int in_width,
+                      const int out_height, const int out_width,
+                      const int pad_top, const int pad_left, T* out_data) {
+  for (int n = 0; n < num; ++n) {
+    for (int out_h = 0; out_h < out_height; ++out_h) {
+      for (int out_w = 0; out_w < out_width; ++out_w) {
+        const int out_index = (out_h * out_width + out_w) * channels;
+        int in_h = out_h - pad_top;
+        int in_w = out_w - pad_left;
+        in_h = std::max(in_h, -in_h);
+        in_h = std::min(in_h, 2 * in_height - in_h - 2);
+        in_w = std::max(in_w, -in_w);
+        in_w = std::min(in_w, 2 * in_width - in_w - 2);
+        const int in_index = (in_h * in_width + in_w) * channels;
+
+        for (int c = 0; c < channels; ++c) {
+          out_data[out_index + c] = in_data[in_index + c];
+        }
+      }
+    }
+    in_data += in_height * in_width * channels;
+    out_data += out_height * out_width * channels;
+  }
+}
+
+template <typename T>
+void Pad2DEdgeNCHW(const T* in_data, const int num, const int channels,
+                   const int in_height, const int in_width,
+                   const int out_height, const int out_width, const int pad_top,
+                   const int pad_left, T* out_data) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int out_h = 0; out_h < out_height; ++out_h) {
+        for (int out_w = 0; out_w < out_width; ++out_w) {
+          int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+          int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+          out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w];
+        }
+      }
+      in_data += in_height * in_width;
+      out_data += out_height * out_width;
+    }
+  }
+}
+
+template <typename T>
+void Pad2DEdgeNHWC(const T* in_data, const int num, const int channels,
+                   const int in_height, const int in_width,
+                   const int out_height, const int out_width, const int pad_top,
+                   const int pad_left, T* out_data) {
+  for (int n = 0; n < num; ++n) {
+    for (int out_h = 0; out_h < out_height; ++out_h) {
+      for (int out_w = 0; out_w < out_width; ++out_w) {
+        const int out_index = (out_h * out_width + out_w) * channels;
+        int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+        int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+        const int in_index = (in_h * in_width + in_w) * channels;
+        for (int c = 0; c < channels; ++c) {
+          out_data[out_index + c] = in_data[in_index + c];
+        }
+      }
+    }
+    in_data += in_height * in_width * channels;
+    out_data += out_height * out_width * channels;
+  }
+}
+
+template <typename T>
+void Pad2DGradConstNCHW(T* d_in_data, const int num, const int channels,
+                        const int in_height, const int in_width,
+                        const int out_height, const int out_width,
+                        const int pad_top, const int pad_left,
+                        const T* d_out_data) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int out_h = 0; out_h < out_height; ++out_h) {
+        for (int out_w = 0; out_w < out_width; ++out_w) {
+          int in_h = out_h - pad_top;
+          int in_w = out_w - pad_left;
+          if (!(in_h < 0 || in_w < 0 || in_h >= in_height ||
+                in_w >= in_width)) {
+            d_in_data[in_h * in_width + in_w] =
+                d_out_data[out_h * out_width + out_w];
+          }
+        }
+      }
+      d_in_data += in_height * in_width;
+      d_out_data += out_height * out_width;
+    }
+  }
+}
+
+template <typename T>
+void Pad2DGradConstNHWC(T* d_in_data, const int num, const int channels,
+                        const int in_height, const int in_width,
+                        const int out_height, const int out_width,
+                        const int pad_top, const int pad_left,
+                        const T* d_out_data) {
+  for (int n = 0; n < num; ++n) {
+    for (int out_h = 0; out_h < out_height; ++out_h) {
+      for (int out_w = 0; out_w < out_width; ++out_w) {
+        int in_h = out_h - pad_top;
+        int in_w = out_w - pad_left;
+        const int out_index = (out_h * out_width + out_w) * channels;
+        if (!(in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)) {
+          const int in_index = (in_h * in_width + in_w) * channels;
+          for (int c = 0; c < channels; ++c) {
+            d_in_data[in_index + c] = d_out_data[out_index + c];
+          }
+        }
+      }
+    }
+    d_in_data += in_height * in_width * channels;
+    d_out_data += out_height * out_width * channels;
+  }
+}
+
+template <typename T>
+void Pad2DGradReflectNCHW(T* d_in_data, const int num, const int channels,
+                          const int in_height, const int in_width,
+                          const int out_height, const int out_width,
+                          const int pad_top, const int pad_left,
+                          const T* d_out_data) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int out_h = 0; out_h < out_height; ++out_h) {
+        for (int out_w = 0; out_w < out_width; ++out_w) {
+          int in_h = out_h - pad_top;
+          int in_w = out_w - pad_left;
+          in_h = std::max(in_h, -in_h);  // reflect over 0
+          in_h = std::min(in_h,
+                          2 * in_height - in_h - 2);  // reflect over in_height
+          in_w = std::max(in_w, -in_w);               // reflect over 0
+          in_w =
+              std::min(in_w, 2 * in_width - in_w - 2);  // reflect over in_width
+          d_in_data[in_h * in_width + in_w] +=
+              d_out_data[out_h * out_width + out_w];
+        }
+      }
+      d_in_data += in_height * in_width;
+      d_out_data += out_height * out_width;
+    }
+  }
+}
+
+template <typename T>
+void Pad2DGradReflectNHWC(T* d_in_data, const int num, const int channels,
+                          const int in_height, const int in_width,
+                          const int out_height, const int out_width,
+                          const int pad_top, const int pad_left,
+                          const T* d_out_data) {
+  for (int n = 0; n < num; ++n) {
+    for (int out_h = 0; out_h < out_height; ++out_h) {
+      for (int out_w = 0; out_w < out_width; ++out_w) {
+        const int out_index = (out_h * out_width + out_w) * channels;
+        int in_h = out_h - pad_top;
+        int in_w = out_w - pad_left;
+        in_h = std::max(in_h, -in_h);
+        in_h = std::min(in_h, 2 * in_height - in_h - 2);
+        in_w = std::max(in_w, -in_w);
+        in_w = std::min(in_w, 2 * in_width - in_w - 2);
+        const int in_index = (in_h * in_width + in_w) * channels;
+        for (int c = 0; c < channels; ++c) {
+          d_in_data[in_index + c] += d_out_data[out_index + c];
+        }
+      }
+    }
+    d_in_data += in_height * in_width * channels;
+    d_out_data += out_height * out_width * channels;
+  }
+}
+
+template <typename T>
+void Pad2DGradEdgeNCHW(T* d_in_data, const int num, const int channels,
+                       const int in_height, const int in_width,
+                       const int out_height, const int out_width,
+                       const int pad_top, const int pad_left,
+                       const T* d_out_data) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int out_h = 0; out_h < out_height; ++out_h) {
+        for (int out_w = 0; out_w < out_width; ++out_w) {
+          int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+          int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+          d_in_data[in_h * in_width + in_w] +=
+              d_out_data[out_h * out_width + out_w];
+        }
+      }
+      d_in_data += in_height * in_width;
+      d_out_data += out_height * out_width;
+    }
+  }
+}
+
+template <typename T>
+void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels,
+                       const int in_height, const int in_width,
+                       const int out_height, const int out_width,
+                       const int pad_top, const int pad_left,
+                       const T* d_out_data) {
+  for (int n = 0; n < num; ++n) {
+    for (int out_h = 0; out_h < out_height; ++out_h) {
+      for (int out_w = 0; out_w < out_width; ++out_w) {
+        const int out_index = (out_h * out_width + out_w) * channels;
+        int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+        int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+        const int in_index = (in_h * in_width + in_w) * channels;
+        for (int c = 0; c < channels; ++c) {
+          d_in_data[in_index + c] += d_out_data[out_index + c];
+        }
+      }
+    }
+    d_in_data += in_height * in_width * channels;
+    d_out_data += out_height * out_width * channels;
+  }
+}
+
+template <typename T>
+class Pad2dCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto pads = context.Attr<std::vector<int>>("paddings");
+    auto mode = context.Attr<std::string>("mode");
+    auto data_format = context.Attr<std::string>("data_format");
+    T value = context.Attr<T>("pad_value");
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    auto in_dims = x->dims();
+    auto out_dims = out->dims();
+    const T* in_data = x->data<T>();
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const int pad_top = pads[0];
+    const int pad_left = pads[2];
+    const int num = in_dims[0];
+    if (data_format == "NCHW") {
+      const int channels = in_dims[1];
+      const int in_height = in_dims[2];
+      const int in_width = in_dims[3];
+      const int out_height = out_dims[2];
+      const int out_width = out_dims[3];
+      if (mode == "reflect") {
+        Pad2DReflectNCHW(in_data, num, channels, in_height, in_width,
+                         out_height, out_width, pad_top, pad_left, out_data);
+      } else if (mode == "edge") {
+        Pad2DEdgeNCHW(in_data, num, channels, in_height, in_width, out_height,
+                      out_width, pad_top, pad_left, out_data);
+      } else {
+        Pad2DConstNCHW(in_data, num, channels, in_height, in_width, out_height,
+                       out_width, pad_top, pad_left, value, out_data);
+      }
+    } else {
+      const int channels = in_dims[3];
+      const int in_height = in_dims[1];
+      const int in_width = in_dims[2];
+      const int out_height = out_dims[1];
+      const int out_width = out_dims[2];
+      if (mode == "reflect") {
+        Pad2DReflectNHWC(in_data, num, channels, in_height, in_width,
+                         out_height, out_width, pad_top, pad_left, out_data);
+      } else if (mode == "edge") {
+        Pad2DEdgeNHWC(in_data, num, channels, in_height, in_width, out_height,
+                      out_width, pad_top, pad_left, out_data);
+      } else {
+        Pad2DConstNHWC(in_data, num, channels, in_height, in_width, out_height,
+                       out_width, pad_top, pad_left, value, out_data);
+      }
+    }
+  }
+};
+
+template <typename T>
+class Pad2dGradCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto pads = context.Attr<std::vector<int>>("paddings");
+    auto mode = context.Attr<std::string>("mode");
+    auto data_format = context.Attr<std::string>("data_format");
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_in = context.Output<Tensor>(framework::GradVarName("X"));
+    auto d_in_dims = d_in->dims();
+    auto d_out_dims = d_out->dims();
+    const T* d_out_data = d_out->data<T>();
+    T* d_in_data = d_in->mutable_data<T>(context.GetPlace());
+    math::SetConstant<platform::CPUDeviceContext, T> set_zero;
+    set_zero(context.template device_context<platform::CPUDeviceContext>(),
+             d_in, static_cast<T>(0));
+    const int pad_top = pads[0];
+    const int pad_left = pads[2];
+    const int num = d_in_dims[0];
+    if (data_format == "NCHW") {
+      const int channels = d_in_dims[1];
+      const int in_height = d_in_dims[2];
+      const int in_width = d_in_dims[3];
+      const int out_height = d_out_dims[2];
+      const int out_width = d_out_dims[3];
+      if (mode == "reflect") {
+        Pad2DGradReflectNCHW(d_in_data, num, channels, in_height, in_width,
+                             out_height, out_width, pad_top, pad_left,
+                             d_out_data);
+      } else if (mode == "edge") {
+        Pad2DGradEdgeNCHW(d_in_data, num, channels, in_height, in_width,
+                          out_height, out_width, pad_top, pad_left, d_out_data);
+      } else {
+        Pad2DGradConstNCHW(d_in_data, num, channels, in_height, in_width,
+                           out_height, out_width, pad_top, pad_left,
+                           d_out_data);
+      }
+    } else {
+      const int channels = d_in_dims[3];
+      const int in_height = d_in_dims[1];
+      const int in_width = d_in_dims[2];
+      const int out_height = d_out_dims[1];
+      const int out_width = d_out_dims[2];
+      if (mode == "reflect") {
+        Pad2DGradReflectNHWC(d_in_data, num, channels, in_height, in_width,
+                             out_height, out_width, pad_top, pad_left,
+                             d_out_data);
+      } else if (mode == "edge") {
+        Pad2DGradEdgeNHWC(d_in_data, num, channels, in_height, in_width,
+                          out_height, out_width, pad_top, pad_left, d_out_data);
+      } else {
+        Pad2DGradConstNHWC(d_in_data, num, channels, in_height, in_width,
+                           out_height, out_width, pad_top, pad_left,
+                           d_out_data);
+      }
+    }
+  }
+};
+
+class Pad2dOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of Pad2dOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of Pad2dOp should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    PADDLE_ENFORCE_EQ(x_dim.size(), 4,
+                      "Size of paddings should be equal to 4.");
+    std::vector<int64_t> out_dims(x_dim.size());
+
+    auto data_format = ctx->Attrs().Get<std::string>("data_format");
+    out_dims[0] = x_dim[0];
+    if (data_format == "NCHW") {
+      out_dims[1] = x_dim[1];
+      out_dims[2] = x_dim[2] + paddings[0] + paddings[1];  // height
+      out_dims[3] = x_dim[3] + paddings[2] + paddings[3];  // width
+    } else {                                               // NHWC
+      out_dims[3] = x_dim[3];
+      out_dims[1] = x_dim[1] + paddings[0] + paddings[1];
+      out_dims[2] = x_dim[2] + paddings[2] + paddings[3];
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    if (out_dims[0] == x_dim[0]) {
+      // Only pass LoD when the first dimension is equal between
+      // output and input.
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
+  }
+};
+
+class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input of pad2d op. "
+             "The input should be a 4-D tensor with formate NCHW or NHWC.");
+    AddOutput("Out",
+              "The output of pad2d op. "
+              "A tensor with the same shape as X.");
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "(vector<int>) "
+        "A list<int> to describe the padding rules."
+        "paddings=[0, 1, 2, 3] means "
+        "padding 0 row to top, 1 row to bottom, 2 columns to left "
+        "and 3 columns to right. Size of paddings must be 4.");
+    AddAttr<float>("pad_value",
+                   "(float, default 0.0) "
+                   "The value to fill the padded areas in constant mode.")
+        .SetDefault(0.0f);
+    AddAttr<std::string>("mode",
+                         "(float, default constant) "
+                         "Three modes: constant(default), reflect, edge.")
+        .SetDefault("constant");
+    AddAttr<std::string>(
+        "data_format",
+        "(string, default NCHW) Only used in "
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\". Specify the data format of the input data.")
+        .SetDefault("NCHW");
+    AddComment(R"DOC(
+Pad2d Operator.
+Pad 2-d images accordding to 'paddings' and 'mode'. 
+If mode is 'reflect', paddings[0] and paddings[1] must be no greater
+than height-1. And the width dimension has the same condition.
+
+Given that X is a channel of image from input:
+
+X = [[1, 2, 3],
+     [4, 5, 6]]
+
+Case 0:
+
+paddings = [0, 1, 2, 3],
+mode = 'constant'
+pad_value = 0
+
+Out = [[0, 0, 1, 2, 3, 0, 0, 0]
+       [0, 0, 4, 5, 6, 0, 0, 0]
+       [0, 0, 0, 0, 0, 0, 0, 0]]
+
+Case 1:
+
+paddings = [0, 1, 2, 1],
+mode = 'reflect'
+
+Out = [[3, 2, 1, 2, 3, 2]
+       [6, 5, 4, 5, 6, 5]
+       [3, 2, 1, 2, 3, 2]]
+
+Case 2:
+
+paddings = [0, 1, 2, 1],
+mode = 'edge'
+
+Out = [[1, 1, 1, 2, 3, 3]
+       [4, 4, 4, 5, 6, 6]
+       [4, 4, 4, 5, 6, 6]]
+)DOC");
+  }
+};
+
+class Pad2dOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+
+class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* bind = new framework::OpDesc();
+    bind->SetInput("X", Input("X"));
+    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    bind->SetAttrMap(Attrs());
+    bind->SetType("pad2d_grad");
+    return std::unique_ptr<framework::OpDesc>(bind);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(pad2d, ops::Pad2dOp, ops::Pad2dOpMaker,
+                  ops::Pad2dOpGradMaker);
+REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad);
+REGISTER_OP_CPU_KERNEL(pad2d, ops::Pad2dCPUKernel<float>);
+REGISTER_OP_CPU_KERNEL(pad2d_grad, ops::Pad2dGradCPUKernel<float>);
diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9ba0ddbd84a43cfd5f028ce072b5c7606fae343d
--- /dev/null
+++ b/paddle/fluid/operators/pad2d_op.cu
@@ -0,0 +1,432 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+using framework::Tensor;
+
+template <typename T>
+__global__ void Pad2DConstNCHW(const int nthreads, const T* in_data,
+                               const int num, const int channels,
+                               const int in_height, const int in_width,
+                               const int out_height, const int out_width,
+                               const int pad_top, const int pad_left, T value,
+                               T* out_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+    out_data[index] =
+        (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)
+            ? value
+            : in_data[(nc * in_height + in_h) * in_width + in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad2DConstNHWC(const int nthreads, const T* in_data,
+                               const int num, const int channels,
+                               const int in_height, const int in_width,
+                               const int out_height, const int out_width,
+                               const int pad_top, const int pad_left, T value,
+                               T* out_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int in_h = out_h - pad_top;
+    const int in_w = out_w - pad_left;
+    out_data[index] =
+        (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)
+            ? value
+            : in_data[((n * in_height + in_h) * in_width + in_w) * channels +
+                      c];
+  }
+}
+
+template <typename T>
+__global__ void Pad2DReflectNCHW(const int nthreads, const T* in_data,
+                                 const int num, const int channels,
+                                 const int in_height, const int in_width,
+                                 const int out_height, const int out_width,
+                                 const int pad_top, const int pad_left,
+                                 T* out_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+    in_h = max(in_h, -in_h);                     // reflect by 0
+    in_h = min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
+    in_w = max(in_w, -in_w);                     // reflect by 0
+    in_w = min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
+    out_data[index] = in_data[(nc * in_height + in_h) * in_width + in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad2DReflectNHWC(const int nthreads, const T* in_data,
+                                 const int num, const int channels,
+                                 const int in_height, const int in_width,
+                                 const int out_height, const int out_width,
+                                 const int pad_top, const int pad_left,
+                                 T* out_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+    in_h = max(in_h, -in_h);
+    in_h = min(in_h, 2 * in_height - in_h - 2);
+    in_w = max(in_w, -in_w);
+    in_w = min(in_w, 2 * in_width - in_w - 2);
+    out_data[index] =
+        in_data[((n * in_height + in_h) * in_width + in_w) * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad2DEdgeNCHW(const int nthreads, const T* in_data,
+                              const int num, const int channels,
+                              const int in_height, const int in_width,
+                              const int out_height, const int out_width,
+                              const int pad_top, const int pad_left,
+                              T* out_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+    out_data[index] = in_data[(nc * in_height + in_h) * in_width + in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad2DEdgeNHWC(const int nthreads, const T* in_data,
+                              const int num, const int channels,
+                              const int in_height, const int in_width,
+                              const int out_height, const int out_width,
+                              const int pad_top, const int pad_left,
+                              T* out_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+    out_data[index] =
+        in_data[((n * in_height + in_h) * in_width + in_w) * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad2DGradConstNCHW(const int in_size, T* d_in_data,
+                                   const int num, const int channels,
+                                   const int in_height, const int in_width,
+                                   const int out_height, const int out_width,
+                                   const int pad_top, const int pad_left,
+                                   const T* d_out_data) {
+  CUDA_1D_KERNEL_LOOP(in_index, in_size) {
+    int nc = in_index / in_width;
+    const int out_w = in_index % in_width + pad_left;
+    const int out_h = nc % in_height + pad_top;
+    nc /= in_height;
+    d_in_data[in_index] =
+        d_out_data[(nc * out_height + out_h) * out_width + out_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad2DGradConstNHWC(const int in_size, T* d_in_data,
+                                   const int num, const int channels,
+                                   const int in_height, const int in_width,
+                                   const int out_height, const int out_width,
+                                   const int pad_top, const int pad_left,
+                                   const T* d_out_data) {
+  CUDA_1D_KERNEL_LOOP(in_index, in_size) {
+    int n = in_index / channels;
+    const int c = in_index % channels;
+    const int out_w = n % in_width + pad_left;
+    n /= in_width;
+    const int out_h = n % in_height + pad_top;
+    n /= in_height;
+    d_in_data[in_index] =
+        d_out_data[((n * out_height + out_h) * out_width + out_w) * channels +
+                   c];
+  }
+}
+
+template <typename T>
+__global__ void Pad2DGradReflectNCHW(const int out_size, T* d_in_data,
+                                     const int num, const int channels,
+                                     const int in_height, const int in_width,
+                                     const int out_height, const int out_width,
+                                     const int pad_top, const int pad_left,
+                                     const T* d_out_data) {
+  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
+    int nc = out_index / out_width;
+    const int out_w = out_index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+    in_h = max(in_h, -in_h);
+    in_w = max(in_w, -in_w);
+    in_h = min(in_h, 2 * in_height - in_h - 2);
+    in_w = min(in_w, 2 * in_width - in_w - 2);
+    atomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
+              d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad2DGradReflectNHWC(const int out_size, T* d_in_data,
+                                     const int num, const int channels,
+                                     const int in_height, const int in_width,
+                                     const int out_height, const int out_width,
+                                     const int pad_top, const int pad_left,
+                                     const T* d_out_data) {
+  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
+    const int c = out_index % channels;
+    int n = out_index / channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+    in_h = max(in_h, -in_h);
+    in_w = max(in_w, -in_w);
+    in_h = min(in_h, in_height * 2 - in_h - 2);
+    in_w = min(in_w, in_width * 2 - in_w - 2);
+    atomicAdd(
+        &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad2DGradEdgeNCHW(const int out_size, T* d_in_data,
+                                  const int num, const int channels,
+                                  const int in_height, const int in_width,
+                                  const int out_height, const int out_width,
+                                  const int pad_top, const int pad_left,
+                                  const T* d_out_data) {
+  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
+    int nc = out_index / out_width;
+    const int out_w = out_index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+    atomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
+              d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data,
+                                  const int num, const int channels,
+                                  const int in_height, const int in_width,
+                                  const int out_height, const int out_width,
+                                  const int pad_top, const int pad_left,
+                                  const T* d_out_data) {
+  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
+    const int c = out_index % channels;
+    int n = out_index / channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+    atomicAdd(
+        &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+class Pad2dCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto pads = context.Attr<std::vector<int>>("paddings");
+    auto mode = context.Attr<std::string>("mode");
+    auto data_format = context.Attr<std::string>("data_format");
+    T value = context.Attr<T>("pad_value");
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    auto in_dims = x->dims();
+    auto out_dims = out->dims();
+    const T* in_data = x->data<T>();
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const int pad_top = pads[0];
+    const int pad_left = pads[2];
+    const int num = in_dims[0];
+
+    auto stream = context.cuda_device_context().stream();
+    int block = PADDLE_CUDA_NUM_THREADS;
+    const int out_size = out->numel();
+    int grid = (out_size + block - 1) / block;
+
+    if (data_format == "NCHW") {
+      const int channels = in_dims[1];
+      const int in_height = in_dims[2];
+      const int in_width = in_dims[3];
+      const int out_height = out_dims[2];
+      const int out_width = out_dims[3];
+      if (mode == "reflect") {
+        Pad2DReflectNCHW<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_height, in_width, out_height,
+            out_width, pad_top, pad_left, out_data);
+      } else if (mode == "edge") {
+        Pad2DEdgeNCHW<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_height, in_width, out_height,
+            out_width, pad_top, pad_left, out_data);
+      } else {
+        Pad2DConstNCHW<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_height, in_width, out_height,
+            out_width, pad_top, pad_left, value, out_data);
+      }
+    } else {
+      const int channels = in_dims[3];
+      const int in_height = in_dims[1];
+      const int in_width = in_dims[2];
+      const int out_height = out_dims[1];
+      const int out_width = out_dims[2];
+      if (mode == "reflect") {
+        Pad2DReflectNHWC<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_height, in_width, out_height,
+            out_width, pad_top, pad_left, out_data);
+      } else if (mode == "edge") {
+        Pad2DEdgeNHWC<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_height, in_width, out_height,
+            out_width, pad_top, pad_left, out_data);
+      } else {
+        Pad2DConstNHWC<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_height, in_width, out_height,
+            out_width, pad_top, pad_left, value, out_data);
+      }
+    }
+  }
+};
+
+template <typename T>
+class Pad2dGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto pads = context.Attr<std::vector<int>>("paddings");
+    auto mode = context.Attr<std::string>("mode");
+    auto data_format = context.Attr<std::string>("data_format");
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_in = context.Output<Tensor>(framework::GradVarName("X"));
+    auto d_in_dims = d_in->dims();
+    auto d_out_dims = d_out->dims();
+    const T* d_out_data = d_out->data<T>();
+    T* d_in_data = d_in->mutable_data<T>(context.GetPlace());
+
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    set_zero(context.template device_context<platform::CUDADeviceContext>(),
+             d_in, static_cast<T>(0));
+
+    const int pad_top = pads[0];
+    const int pad_left = pads[2];
+    const int num = d_in_dims[0];
+
+    auto stream = context.cuda_device_context().stream();
+    int block = PADDLE_CUDA_NUM_THREADS;
+    const int out_size = d_out->numel();
+    const int in_size = d_in->numel();
+    int grid = (out_size + block - 1) / block;
+
+    if (data_format == "NCHW") {
+      const int channels = d_in_dims[1];
+      const int in_height = d_in_dims[2];
+      const int in_width = d_in_dims[3];
+      const int out_height = d_out_dims[2];
+      const int out_width = d_out_dims[3];
+      if (mode == "reflect") {
+        Pad2DGradReflectNCHW<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_height, in_width, out_height,
+            out_width, pad_top, pad_left, d_out_data);
+      } else if (mode == "edge") {
+        Pad2DGradEdgeNCHW<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_height, in_width, out_height,
+            out_width, pad_top, pad_left, d_out_data);
+      } else {
+        grid = (in_size + block - 1) / block;
+        Pad2DGradConstNCHW<T><<<grid, block, 0, stream>>>(
+            in_size, d_in_data, num, channels, in_height, in_width, out_height,
+            out_width, pad_top, pad_left, d_out_data);
+      }
+    } else {
+      const int channels = d_in_dims[3];
+      const int in_height = d_in_dims[1];
+      const int in_width = d_in_dims[2];
+      const int out_height = d_out_dims[1];
+      const int out_width = d_out_dims[2];
+      if (mode == "reflect") {
+        Pad2DGradReflectNHWC<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_height, in_width, out_height,
+            out_width, pad_top, pad_left, d_out_data);
+      } else if (mode == "edge") {
+        Pad2DGradEdgeNHWC<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_height, in_width, out_height,
+            out_width, pad_top, pad_left, d_out_data);
+      } else {
+        grid = (in_size + block - 1) / block;
+        Pad2DGradConstNHWC<T><<<grid, block, 0, stream>>>(
+            in_size, d_in_data, num, channels, in_height, in_width, out_height,
+            out_width, pad_top, pad_left, d_out_data);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(pad2d, ops::Pad2dCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(pad2d_grad, ops::Pad2dGradCUDAKernel<float>);
diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h
index f9076cbc678534fd5490fa0d7adeac0e50909a39..12f1525594ecf0887618616ffe563bd2bda32496 100644
--- a/paddle/fluid/operators/prelu_op.h
+++ b/paddle/fluid/operators/prelu_op.h
@@ -38,10 +38,9 @@ class PReluKernel : public framework::OpKernel<T> {
     auto dim = x->dims();
     int index = 0;
     int i = 0;
-    int temp = 0;
     if (mode == "channel") {
+      int temp = numel / (dim[0] * dim[1]);
       for (i = 0; i < numel; i++) {
-        temp = numel / (dim[0] * dim[1]);
         index = (i / temp) % dim[1];
         o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
       }
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index a1dfe39c3a4f84f5e4aaa2306813a7decf0e49ea..d72f85f2c44db2fa887732cfc05e1376a6a79e4a 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -246,6 +246,88 @@ class ReshapeGradKernel {
   }
 };
 
+// FIXME(zcd): reshape2 adds an intermediate output(XShape) based on reshape,
+// the XShape is used to carry the shape and lod of X which will be used in
+// reshape_grad, in this way, the framework can reuse the memory of X
+// immediately the reshape_op is finished.
+// Considering compatibility issues, we could not fix reshape_op
+class Reshape2Op : public ReshapeOp {
+ public:
+  Reshape2Op(const std::string &type, const framework::VariableNameMap &inputs,
+             const framework::VariableNameMap &outputs,
+             const framework::AttributeMap &attrs)
+      : ReshapeOp(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    ReshapeOp::InferShape(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) of ReshapeOp should not be null.");
+    const auto &x_dims = ctx->GetInputDim("X");
+    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
+    xshape_dims[0] = 0;
+    for (int i = 0; i < x_dims.size(); ++i) {
+      xshape_dims[i + 1] = x_dims[i];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+};
+
+class Reshape2OpMaker : public ReshapeOpMaker {
+ public:
+  void Make() override {
+    ReshapeOpMaker::Make();
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in FlattenGradOp.")
+        .AsIntermediate();
+  }
+};
+
+class Reshape2GradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("reshape2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+class Reshape2GradOp : public framework::OperatorWithKernel {
+ public:
+  Reshape2GradOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    auto xshape_dims = ctx->GetInputDim("XShape");
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("XShape", framework::GradVarName("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
+                ->type()),
+        ctx.device_context());
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
@@ -261,6 +343,17 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                                ops::ReshapeGradKernel, int64_t,
                                ops::ReshapeGradKernel);
 
+REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker,
+                  ops::Reshape2GradMaker);
+REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
+                               ops::ReshapeKernel, int, ops::ReshapeKernel,
+                               int64_t, ops::ReshapeKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
+                               double, ops::ReshapeGradKernel, int,
+                               ops::ReshapeGradKernel, int64_t,
+                               ops::ReshapeGradKernel);
+
 #ifdef PADDLE_WITH_CUDA
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
                                 ops::ReshapeKernel, int, ops::ReshapeKernel,
@@ -269,4 +362,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                                 double, ops::ReshapeGradKernel, int,
                                 ops::ReshapeGradKernel, int64_t,
                                 ops::ReshapeGradKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
+                                ops::ReshapeKernel, int, ops::ReshapeKernel,
+                                int64_t, ops::ReshapeKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
+                                double, ops::ReshapeGradKernel, int,
+                                ops::ReshapeGradKernel, int64_t,
+                                ops::ReshapeGradKernel);
 #endif
diff --git a/paddle/fluid/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc
index 919ebe48ca38040274bd2052b95ef96eccff4db6..2f773f222e50a440801b06a4fd997bf237b34772 100644
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
@@ -36,9 +36,13 @@ class RmspropOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                    "Output(param_out) of RmspropOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
-                   "Output(Momentum_out) of RmspropOp should not be null.");
+                   "Output(MomentOut) of RmspropOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
                    "Output(MeanSquareOut) of RmspropOp should not be null.");
+    if (ctx->Attrs().Get<bool>("centered")) {
+      PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"),
+                     "Output(MeanGradOut) of RmspropOp should not be null.");
+    }
 
     auto param_dim = ctx->GetInputDim("Param");
     PADDLE_ENFORCE_EQ(
@@ -58,6 +62,9 @@ class RmspropOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("MomentOut", param_dim);
     ctx->SetOutputDim("MeanSquareOut", param_dim);
+    if (ctx->Attrs().Get<bool>("centered")) {
+      ctx->SetOutputDim("MeanGradOut", param_dim);
+    }
   }
 };
 
@@ -70,6 +77,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("MeanSquare",
              "(Tensor, default Tensor<float>)"
              " The mean square value that gets updated.");
+    AddInput("MeanGrad",
+             "(Tensor, default Tensor<float>)"
+             " The moving average of gradient")
+        .AsDispensable();
     AddInput("LearningRate",
              "(Tensor, default Tensor<float>) "
              "The learning rate should be a tensor of size 1.");
@@ -82,6 +93,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
     AddOutput("MomentOut", "(Tensor) Output updated moment.");
     AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
+    AddOutput("MeanGradOut",
+              "(Tensor) Output moving average of gradient updated value.");
 
     AddAttr<float>("epsilon",
                    "(float, default 1e-10) Constant "
@@ -93,6 +106,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(0.9f);
     AddAttr<float>("momentum", "(float, default 0.0) Constant value.")
         .SetDefault(0.0f);
+    AddAttr<bool>("centered", "(bool, default false) use centered rmsprop.")
+        .SetDefault(false);
     AddComment(R"DOC(
 Rmsprop Optimizer. 
 
@@ -103,6 +118,14 @@ MomentOut = momentum * Moment +
 ParamOut = Param -  MomentOut
 $$
 
+if centered is true:
+
+mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
+mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
+mom = momentum * mom{t-1} + learning_rate * g_t /
+    sqrt(mean_square - mean_grad**2 + epsilon)
+param -= mom
+
 The original slides that proposed Rmsprop: Slide 29 of
 http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
 
diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h
index 12836f43bde47ac87eb0af33dea501593b659a5d..25ed32c5ebb2ff5be962ac1e3e38c970623d705c 100644
--- a/paddle/fluid/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
@@ -41,6 +41,7 @@ class RmspropOpKernel : public framework::OpKernel<T> {
     float epsilon = ctx.Attr<float>("epsilon");
     float rho = ctx.Attr<float>("decay");
     float momentum = ctx.Attr<float>("momentum");
+    bool centered = ctx.Attr<bool>("centered");
 
     auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
     auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
@@ -53,12 +54,24 @@ class RmspropOpKernel : public framework::OpKernel<T> {
     auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
-    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+    Eigen::DSizes<int, 1> grad_dsize(static_cast<int>(grad->numel()));
 
     ms_out.device(place) = rho * ms + (1 - rho) * g * g;
-    mom_out.device(place) =
-        momentum * mom +
-        lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
+    if (centered) {
+      auto mg = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanGrad"));
+      auto* mean_grad_out = ctx.Output<Tensor>("MeanGradOut");
+      mean_grad_out->mutable_data<T>(ctx.GetPlace());
+      auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
+
+      mg_out.device(place) = rho * mg + (1 - rho) * g;
+      mom_out.device(place) = momentum * mom +
+                              lr.broadcast(grad_dsize) * g /
+                                  (ms_out - mg_out.square() + epsilon).sqrt();
+    } else {
+      mom_out.device(place) =
+          momentum * mom +
+          lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
+    }
     p_out.device(place) = p - mom_out;
   }
 };
diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu
index 50450b62f7b1c0b2b5abf01a43581a0e2d2cd01e..46e20285db6d7acd39dead3994409645adddf494 100644
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
@@ -31,7 +31,7 @@ static inline int NumBlocks(const int N) {
 
 template <typename T>
 __global__ void GPUROIPoolForward(
-    const int nthreads, const T* input_data, const int64_t* input_rois,
+    const int nthreads, const T* input_data, const T* input_rois,
     const float spatial_scale, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     int* roi_batch_id_data, T* output_data, int64_t* argmax_data) {
@@ -43,7 +43,7 @@ __global__ void GPUROIPoolForward(
     int c = (i / pooled_width / pooled_height) % channels;
     int n = i / pooled_width / pooled_height / channels;
 
-    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    const T* offset_input_rois = input_rois + n * kROISize;
     int roi_batch_ind = roi_batch_id_data[n];
     int roi_start_w = round(offset_input_rois[0] * spatial_scale);
     int roi_start_h = round(offset_input_rois[1] * spatial_scale);
@@ -93,7 +93,7 @@ __global__ void GPUROIPoolForward(
 
 template <typename T>
 __global__ void GPUROIPoolBackward(
-    const int nthreads, const int64_t* input_rois, const T* output_grad,
+    const int nthreads, const T* input_rois, const T* output_grad,
     const int64_t* argmax_data, const int num_rois, const float spatial_scale,
     const int channels, const int height, const int width,
     const int pooled_height, const int pooled_width, int* roi_batch_id_data,
@@ -174,8 +174,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
 
     GPUROIPoolForward<
         T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-        output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale,
-        channels, height, width, pooled_height, pooled_width,
+        output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
+        height, width, pooled_height, pooled_width,
         roi_batch_id_list_gpu.data<int>(), out->mutable_data<T>(ctx.GetPlace()),
         argmax->mutable_data<int64_t>(ctx.GetPlace()));
   }
@@ -228,7 +228,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
       if (output_grad_size > 0) {
         GPUROIPoolBackward<
             T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-            output_grad_size, rois->data<int64_t>(), out_grad->data<T>(),
+            output_grad_size, rois->data<T>(), out_grad->data<T>(),
             argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
             width, pooled_height, pooled_width,
             roi_batch_id_list_gpu.data<int>(),
diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h
index c4f739b2c6b2d62ebebcc15fd627ebad040e7b3f..07de7c9f0e070cef7c6f38f8d564ab76910842db 100644
--- a/paddle/fluid/operators/roi_pool_op.h
+++ b/paddle/fluid/operators/roi_pool_op.h
@@ -72,7 +72,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
     T* output_data = out->mutable_data<T>(ctx.GetPlace());
     int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
 
-    const int64_t* rois_data = rois->data<int64_t>();
+    const T* rois_data = rois->data<T>();
     for (int n = 0; n < rois_num; ++n) {
       int roi_batch_id = roi_batch_id_data[n];
       int roi_start_w = round(rois_data[0] * spatial_scale);
@@ -171,7 +171,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
         }
       }
 
-      const int64_t* rois_data = rois->data<int64_t>();
+      const T* rois_data = rois->data<T>();
       const T* out_grad_data = out_grad->data<T>();
       const int64_t* argmax_data = argmax->data<int64_t>();
       T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h
index 01308e416a9313bad13ded4e40c79bb0550e31ed..133d3f72dbd6ab13c98d124369038309c94cba5b 100644
--- a/paddle/fluid/operators/sampling_id_op.h
+++ b/paddle/fluid/operators/sampling_id_op.h
@@ -53,7 +53,7 @@ class SamplingIdKernel : public framework::OpKernel<T> {
         static_cast<T>(context.Attr<float>("min")),
         static_cast<T>(context.Attr<float>("max")));
 
-    std::vector<T> ids(batch_size);
+    std::vector<int64_t> ids(batch_size);
     for (int i = 0; i < batch_size; ++i) {
       T r = dist(engine);
       int idx = width - 1;
@@ -63,7 +63,7 @@ class SamplingIdKernel : public framework::OpKernel<T> {
           break;
         }
       }
-      ids[i] = ins_vector[idx];
+      ids[i] = int64_t(idx);
     }
 
     std::vector<int64_t> out_dim;
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index cfee9207083b46f7c27354f22e82a7d3c38a027c..5b05f757c0355ed15617dea925b5d4929fcbfee0 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <stdint.h>
-#include <sys/stat.h>
 #include <fstream>
 #include <numeric>
 #include <sstream>
@@ -23,40 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace operators {
 
-// TODO(sidgoyal78): These function are needed by other files (save_op), move
-// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
-constexpr char kSEP = '/';
-static bool FileExists(const std::string &filepath) {
-  struct stat buffer;
-  return (stat(filepath.c_str(), &buffer) == 0);
-}
-
-static std::string DirName(const std::string &filepath) {
-  auto pos = filepath.rfind(kSEP);
-  if (pos == std::string::npos) {
-    return "";
-  }
-  return filepath.substr(0, pos);
-}
-
-static void MkDir(const char *path) {
-  if (mkdir(path, 0755)) {
-    PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
-  }
-}
-
-static void MkDirRecursively(const char *fullpath) {
-  if (*fullpath == '\0') return;  // empty string
-  if (FileExists(fullpath)) return;
-
-  MkDirRecursively(DirName(fullpath).c_str());
-  MkDir(fullpath);
-}
-
 class SaveCombineOp : public framework::OperatorBase {
  public:
   SaveCombineOp(const std::string &type,
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index 85de37416b5f24128ee98320a872eafffe967c81..e79cffcf498c52ed14db235f6221cfdf08399c9d 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <stdint.h>
-#include <sys/stat.h>
 #include <fstream>
 #include <numeric>
 
@@ -25,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace operators {
@@ -33,36 +33,6 @@ namespace operators {
 // to directory specified.
 constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
 
-// TODO(yuyang18): If the functions below are needed by other files, move them
-// to paddle::filesystem namespace.
-constexpr char kSEP = '/';
-static bool FileExists(const std::string &filepath) {
-  struct stat buffer;
-  return (stat(filepath.c_str(), &buffer) == 0);
-}
-
-static std::string DirName(const std::string &filepath) {
-  auto pos = filepath.rfind(kSEP);
-  if (pos == std::string::npos) {
-    return "";
-  }
-  return filepath.substr(0, pos);
-}
-
-static void MkDir(const char *path) {
-  if (mkdir(path, 0755)) {
-    PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
-  }
-}
-
-static void MkDirRecursively(const char *fullpath) {
-  if (*fullpath == '\0') return;  // empty string
-  if (FileExists(fullpath)) return;
-
-  MkDirRecursively(DirName(fullpath).c_str());
-  MkDir(fullpath);
-}
-
 class SaveOp : public framework::OperatorBase {
  public:
   SaveOp(const std::string &type, const framework::VariableNameMap &inputs,
diff --git a/paddle/fluid/operators/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_enumerate_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..58e48c228bb34814700fd0f7a3d62ef4b1a435dd
--- /dev/null
+++ b/paddle/fluid/operators/sequence_enumerate_op.cc
@@ -0,0 +1,97 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/sequence_enumerate_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceEnumerateOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("X"),
+        "Input(X) of SequecceEnumerate operator should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(X) of SequenceEnumerate operator should not be null.");
+
+    const auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(
+        x_dims.size(), 2UL,
+        "Input(X) of SequenceEnumerate operator's rank should be 2.");
+    PADDLE_ENFORCE_EQ(
+        x_dims[1], 1UL,
+        "Input(X) of SequenceEnumerate operator's 2nd dimension should be 1.");
+
+    const auto win_size = ctx->Attrs().Get<int>("win_size");
+    ctx->SetOutputDim("Out", {x_dims[0], win_size});
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(2-D LoDTensor with the 2nd dimension equal to 1) "
+             "Input LoDTensor of SequenceEnumerate operator.");
+    AddOutput("Out",
+              "(2-D LoDTensor with the 2nd dimension equal to win_size) "
+              "Output LoDTensor of SequenceEnumerate operator.");
+    AddAttr<int>("win_size", "(int) The enumerate sequence window size.")
+        .AddCustomChecker([](const int& win_size) {
+          PADDLE_ENFORCE(win_size >= 2,
+                         "The window size should be not less than 2.");
+        });
+    AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+Sequence Enumerate Operator.
+
+Generate a new sequence for the input index sequence, which enumerates all the
+sub-sequences with length `win_size` of the input. 
+The enumerated sequence has the same 1st dimension with variable `input`, and
+the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
+    
+Examples:
+Case 1:
+  Input:
+    X.lod = [[0, 3, 5]]
+    X.data = [[1], [2], [3], [4], [5]]
+    X.dims = [5, 1]
+  Attrs:
+    win_size = 2
+    pad_value = 0
+  Output:
+    Out.lod = [[0, 3, 5]]
+    Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
+    Out.dims = [5, 2]
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(sequence_enumerate, ops::SequenceEnumerateOp,
+                             ops::SequenceEnumerateOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    sequence_enumerate,
+    ops::SequenceEnumerateKernel<paddle::platform::CPUDeviceContext, int32_t>,
+    ops::SequenceEnumerateKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_enumerate_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bdc9a615aa9a1ecd99c1f6995361f8c5ff0aa383
--- /dev/null
+++ b/paddle/fluid/operators/sequence_enumerate_op.cu
@@ -0,0 +1,84 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include "paddle/fluid/operators/sequence_enumerate_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+using platform::PADDLE_CUDA_NUM_THREADS;
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+__global__ void CalcOutPut(const T* in_data, const size_t* in_lod,
+                           const size_t lod_len, const int64_t win_size,
+                           const int64_t pad_value, T* out_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < in_lod[lod_len - 1]) {
+    int end_idx = 0;
+    // Get LoD interval of index
+    for (int i = 1; i < lod_len; ++i) {
+      if (index < in_lod[i]) {
+        end_idx = in_lod[i];
+        break;
+      }
+    }
+    for (size_t i = 0; i < win_size; ++i) {
+      int word_pos = index + i;
+      out_data[index * win_size + i] =
+          word_pos < end_idx ? in_data[word_pos] : pad_value;
+    }
+  }
+}
+
+template <typename T>
+class SequenceEnumerateOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    int win_size = context.Attr<int>("win_size");
+    int pad_value = context.Attr<int>("pad_value");
+
+    auto in_dims = in->dims();
+    auto in_lod = in->lod();
+
+    PADDLE_ENFORCE_EQ(
+        static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
+        "The actual input data's size mismatched with LoD information.");
+
+    /* Generate enumerate sequence set */
+    auto stream = context.cuda_device_context().stream();
+    auto lod0 = in_lod[0];
+    auto in_len = in->numel();
+    auto in_data = in->data<T>();
+    auto out_data = out->mutable_data<T>(context.GetPlace());
+    // Copy LoD to GPU
+    const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace());
+    // Calc output tensor
+    CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
+                 PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        in_data, dev_in_lod_ptr, lod0.size(), win_size, pad_value, out_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(
+    sequence_enumerate,
+    paddle::operators::SequenceEnumerateOpCUDAKernel<int32_t>,
+    paddle::operators::SequenceEnumerateOpCUDAKernel<int64_t>);
diff --git a/paddle/fluid/operators/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_enumerate_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc18d9b2071303377505155476b87ed029eaf986
--- /dev/null
+++ b/paddle/fluid/operators/sequence_enumerate_op.h
@@ -0,0 +1,56 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+using LoDTensor = framework::LoDTensor;
+
+template <typename DeviceContext, typename T>
+class SequenceEnumerateKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    int win_size = context.Attr<int>("win_size");
+    int pad_value = context.Attr<int>("pad_value");
+
+    auto in_dims = in->dims();
+    auto in_lod = in->lod();
+
+    PADDLE_ENFORCE_EQ(
+        static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
+        "The actual input data's size mismatched with LoD information.");
+
+    // Generate enumerate sequence set
+    auto lod0 = in_lod[0];
+    auto in_data = in->data<T>();
+    auto out_data = out->mutable_data<T>(context.GetPlace());
+    for (size_t i = 0; i < lod0.size() - 1; ++i) {
+      for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) {
+        for (int word_idx = 0; word_idx < win_size; ++word_idx) {
+          size_t word_pos = idx + word_idx;
+          out_data[win_size * idx + word_idx] =
+              word_pos < lod0[i + 1] ? in_data[word_pos] : pad_value;
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_mask_op.h b/paddle/fluid/operators/sequence_mask_op.h
index 0dd554adfe57e469c2fac17f27adae2db7003a6a..18acb735cecabd1e01f7821c880fd8ed5e52971f 100644
--- a/paddle/fluid/operators/sequence_mask_op.h
+++ b/paddle/fluid/operators/sequence_mask_op.h
@@ -99,7 +99,7 @@ struct SequenceMaskFunctor {
       : ctx_(ctx), x_(x), y_(y), limits_(limits), maxlen_(maxlen) {}
 
   template <typename Ty>
-  void operator()() const {
+  void apply() const {
     auto *y_data = y_->mutable_data<Ty>(ctx_.GetPlace());
     platform::ForRange<DeviceContext> for_range(ctx_, limits_);
     for_range(SequenceMaskForRangeFunctor<Tx, Ty>(x_, y_data, maxlen_));
diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc
index 8146c5f56104b7dec86b1c4491ed10fc2e94b58b..29d2fb989754f5621222768a279a1c898ea1c355 100644
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
@@ -62,7 +62,10 @@ class ShrinkRNNMemoryOp : public ArrayOp {
     }
 
     if (dst_num_rows != 0) {
-      out_tensor.ShareDataWith(x_tensor.Slice(0, height));
+      out_tensor.mutable_data(place, x_tensor.type());
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx,
+                            &out_tensor);
     }
   }
 };
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 8a683116b8054de12fc4419b5aa5fbc019b675bb..e389c6a65e1e8220685294931c4d08e6fd928b7f 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -126,15 +126,15 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault({});
     AddComment(R"DOC(
         Squeeze Operator.
-        
-        Remove single-dimensional entries from the shape of a tensor. 
-        Takes a parameter axes with a list of axes to squeeze. 
-        If axes is not provided, all the single dimensions will be removed from the shape. 
+
+        Remove single-dimensional entries from the shape of a tensor.
+        Takes a parameter axes with a list of axes to squeeze.
+        If axes is not provided, all the single dimensions will be removed from the shape.
         If an axis is selected with shape entry not equal to one, an error is raised.
-        
+
         Examples:
         Case 1:
-          Given 
+          Given
             X.shape = (1, 3, 1, 5)
           and
             axes = [0]
@@ -144,7 +144,7 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
         Case 2:
           Given
             X.shape = (1, 3, 1, 5)
-          and 
+          and
             axes = []
           we get:
             Out.shape = (3, 5)
@@ -181,6 +181,113 @@ class SqueezeGradOp : public framework::OperatorBase {
   }
 };
 
+// FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze,
+// the XShape is used to carry the shape and lod of X which will be used in
+// squeeze_grad, in this way, the framework can reuse the memory of X
+// immediately the squeeze2_op is finished.
+// Considering compatibility issues, we could not fix squeeze2_op
+class Squeeze2OpMaker : public SqueezeOpMaker {
+ public:
+  void Make() override {
+    SqueezeOpMaker::Make();
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in SqueezeGradOp.")
+        .AsIntermediate();
+  }
+};
+
+class Squeeze2OpInferShape : public SqueezeOpInferShape {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    SqueezeOpInferShape::operator()(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) of Squeeze operator should not be null.");
+    const auto &x_dims = ctx->GetInputDim("X");
+    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
+    xshape_dims[0] = 0;
+    for (int i = 0; i < x_dims.size(); ++i) {
+      xshape_dims[i + 1] = x_dims[i];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+};
+
+class Squeeze2Op : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axes = Attr<std::vector<int>>("axes");
+    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims);
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(out_dims);
+    // Invoke Reshape Op
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+class Squeeze2GradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("squeeze2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+class Squeeze2GradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("XShape"),
+                   "Input(XShape) shouldn't be null.");
+    PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    auto xshape_dims = context->GetInputDim("XShape");
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    context->SetOutputDim(framework::GradVarName("X"), x_dims);
+    context->ShareLoD("XShape", framework::GradVarName("X"));
+  }
+};
+
+class Squeeze2GradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto xshape_name = Input("XShape");
+    auto xshape_dims =
+        scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(x_dims);
+
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {dout_name}}, {"Shape", {}}},
+        {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -192,3 +299,8 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
                   ops::SqueezeOpInferShape,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape);
+
+REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
+                  ops::Squeeze2OpInferShape, ops::Squeeze2GradOpMaker);
+REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
+                  ops::Squeeze2GradInferShape);
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index 60556a564c25c08612447ebd47a4b432b8a12d29..6a9fc6611a8f8eaa6749aefac0673ccabaebbcfe 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/transpose_op.h"
+#include <string>
 #include <vector>
 
 namespace paddle {
@@ -24,7 +25,7 @@ class TransposeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
     auto x_dims = ctx->GetInputDim("X");
@@ -90,7 +91,7 @@ The behavior of this operator is similar to how `numpy.transpose` works.
          2 &5
     \end{pmatrix}$$
 
-- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is 
+- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is
 $[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$.
 
 )DOC");
@@ -101,7 +102,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null");
@@ -113,6 +114,93 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
   }
 };
 
+// FIXME(zcd): transpose2 adds an intermediate output(XShape) based on
+// transpose, the XShape is used to carry the shape and lod of X which
+// will be used in transpose_grad, in this way, the framework can reuse
+// the memory of X immediately the transpose2_op is finished.
+// Considering compatibility issues, we could not fix transpose2_op
+class Transpose2Op : public TransposeOp {
+ public:
+  Transpose2Op(const std::string &type,
+               const framework::VariableNameMap &inputs,
+               const framework::VariableNameMap &outputs,
+               const framework::AttributeMap &attrs)
+      : TransposeOp(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    TransposeOp::InferShape(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) should not be null");
+    const auto &in_dims = ctx->GetInputDim("X");
+    std::vector<int64_t> x_shape_dim(in_dims.size() + 1);
+    x_shape_dim[0] = 0;
+    for (int i = 0; i < in_dims.size(); ++i) {
+      x_shape_dim[i + 1] = in_dims[i];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(x_shape_dim));
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class Transpose2OpMaker : public TransposeOpMaker {
+ public:
+  void Make() override {
+    TransposeOpMaker::Make();
+    AddOutput("XShape", "(Tensor)The output tensor.").AsIntermediate();
+  }
+};
+
+class Transpose2GradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("transpose2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+class Transpose2OpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      auto xshape_dim = ctx->GetInputDim("XShape");
+      auto x_shape_dim =
+          framework::slice_ddim(xshape_dim, 1, xshape_dim.size());
+      ctx->SetOutputDim(framework::GradVarName("X"), x_shape_dim);
+      ctx->ShareLoD("XShape", framework::GradVarName("X"));
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
+                ->type()),
+        ctx.device_context());
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -120,8 +208,20 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
+
 REGISTER_OP_CPU_KERNEL(
     transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     transpose_grad,
     ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
+
+REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
+                  ops::Transpose2GradMaker);
+REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    transpose2,
+    ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    transpose2_grad,
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc
index bcd1fb631394bc33b6fc162cfa7cbb20d55a654b..c1b5a8b31be243fab3af06a18c8e51986c953700 100644
--- a/paddle/fluid/operators/transpose_op.cu.cc
+++ b/paddle/fluid/operators/transpose_op.cu.cc
@@ -21,3 +21,10 @@ REGISTER_OP_CUDA_KERNEL(
 REGISTER_OP_CUDA_KERNEL(
     transpose_grad,
     ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
+
+REGISTER_OP_CUDA_KERNEL(
+    transpose2,
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    transpose2_grad,
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index 0fc8d54f6400c9dfb6af1e764ed44e95195bfe6e..405943add238ac2d245df11127bfadb4899e855f 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -127,13 +127,13 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
         });
     AddComment(R"DOC(
     Unsqueeze Operator.
-    
-    Insert single-dimensional entries to the shape of a tensor. 
-    Takes one required argument axes, a list of dimensions that will be inserted. 
-    Dimension indices in axes are as seen in the output tensor. 
 
-    For example: 
-      Given a tensor such that tensor with shape [3, 4, 5], 
+    Insert single-dimensional entries to the shape of a tensor.
+    Takes one required argument axes, a list of dimensions that will be inserted.
+    Dimension indices in axes are as seen in the output tensor.
+
+    For example:
+      Given a tensor such that tensor with shape [3, 4, 5],
       then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1]
     )DOC");
   }
@@ -168,6 +168,112 @@ class UnsqueezeGradOp : public framework::OperatorBase {
   }
 };
 
+// FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on
+// unsqueeze, the XShape is used to carry the shape and lod of X which
+// will be used in unsqueeze_grad, in this way, the framework can reuse
+// the memory of X immediately the unsqueeze2_op is finished.
+// Considering compatibility issues, we could not fix unsqueeze2_op
+class Unsqueeze2OpInferShape : public UnsqueezeOpInferShape {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    UnsqueezeOpInferShape::operator()(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) of Unsqueeze operator should not be null.");
+    const auto &x_dims = ctx->GetInputDim("X");
+    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
+    xshape_dims[0] = 0;
+    for (int i = 0; i < x_dims.size(); ++i) {
+      xshape_dims[i + 1] = x_dims[i];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+};
+
+class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
+ public:
+  void Make() override {
+    UnsqueezeOpMaker::Make();
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in UnsqueezeGradOp.")
+        .AsIntermediate();
+  }
+};
+
+class Unsqueeze2Op : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axes = Attr<std::vector<int>>("axes");
+    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    auto out_dims = Unsqueeze2OpInferShape::GetOutputShape(axes, x_dims);
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(out_dims);
+    // Invoke Reshape op.
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+class Unsqueeze2GradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("unsqueeze2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+class Unsqueeze2GradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("XShape"),
+                   "Input(XShape) shouldn't be null.");
+    PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    auto xshape_dims = context->GetInputDim("XShape");
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    context->SetOutputDim(framework::GradVarName("X"), x_dims);
+    context->ShareLoD("XShape", framework::GradVarName("X"));
+  }
+};
+
+class Unsqueeze2GradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto xshape_name = Input("XShape");
+    auto xshape_dims =
+        scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(x_dims);
+
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {dout_name}}, {"Shape", {}}},
+        {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
@@ -180,3 +286,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
                   ops::UnsqueezeGradInferShape);
+
+REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
+                  ops::Unsqueeze2OpInferShape, ops::Unsqueeze2GradOpMaker);
+REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
+                  ops::Unsqueeze2GradInferShape);
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 4fbfa6354ab45fed4839227a2a4be8fe147e5fd9..6a3ad2151081504fda2a3818c5f99ad47039d91d 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -121,6 +121,12 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
     if (nullptr == dso_handle) {
       LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
                    << dlerror() << ")";
+      if (dlPath.find("nccl") != std::string::npos) {
+        std::cout
+            << "You may need to install 'nccl2' from NVIDIA official website: "
+            << "https://developer.nvidia.com/nccl/nccl-download"
+            << "before install PaddlePaddle" << std::endl;
+      }
       dlPath = dso_name;
       dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
     }
diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h
index 4cc04b090519637ab0b8d3740b8a12f216218cae..32b7efc04c1f2ecc22f93c08387aec69ded4930a 100644
--- a/paddle/fluid/platform/macros.h
+++ b/paddle/fluid/platform/macros.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <cfloat>
 
 // Disable the copy and assignment operator for a class.
 #ifndef DISABLE_COPY_AND_ASSIGN
@@ -23,3 +24,7 @@ limitations under the License. */
   classname& operator=(const classname&) = delete; \
   classname& operator=(classname&&) = delete
 #endif
+
+#if defined(__FLT_MAX__)
+#define FLT_MAX __FLT_MAX__
+#endif  // __FLT_MAX__
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index cc46c88fd1f9a5d1bacad26beed6fd0af6405310..115abb98d56e633c938695c8127c832eab602110 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -100,14 +100,13 @@ struct NCCLContextMap {
       return;
     }
     std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
-    // if pass nccl_id here, can assume we are doing multi node training
-    if (nccl_id == nullptr) {
+    // if num_trainers == 1, should create a new nccl id for local comms.
+    if (num_trainers == 1) {
       std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
       PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
           comms.get(), static_cast<int>(order_.size()), order_.data()));
     } else {
-      PADDLE_ENFORCE_GT(num_trainers, 1);
-      // TODO(wuyi): need to ensure each node have same number of GPUs
+      PADDLE_ENFORCE_NOT_NULL(nccl_id);
       {
         int nranks = num_trainers * order_.size();
         NCCLGroupGuard gurad;
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index a0a2d29500e7afbe8a9a43f010d5fd2d0c560467..cf9f4aa95bc1cb79d95b79331fbc09e11af64194 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -14,24 +14,141 @@
 
 #pragma once
 
+#include <cstdio>
 #include <stdexcept>
+
+#include <memory>
 #include <string>
 
+#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
+#include "glog/logging.h"
+
 #if !defined(_WIN32)
-#include <dlfcn.h>     // for dladdr
-#include <execinfo.h>  // for backtrace
+#define UNUSED __attribute__((unused))
+#include <dlfcn.h>     //  dladdr
+#include <execinfo.h>  // backtrace
+#include <sys/stat.h>
+#include <algorithm>  // std::accumulate
 #else
-#include <Shlwapi.h>
-#include <Windows.h>
+#include <io.h>  // _popen, _pclose
+#include <windows.h>
+#if defined(_WIN32)
+#include <numeric>  // std::accumulate in msvc
+#endif
+// windows version of __attribute__((unused))
+#define UNUSED __pragma(warning(suppress : 4100))
 
-static void* dlsym(void* handle, const char* symbol_name) {
+#ifndef S_ISDIR  // windows port for sys/stat.h
+#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
+#endif  // S_ISDIR
+
+static void *dlsym(void *handle, const char *symbol_name) {
   FARPROC found_symbol;
   found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
 
   if (found_symbol == NULL) {
     throw std::runtime_error(std::string(symbol_name) + " not found.");
   }
-  return reinterpret_cast<void*>(found_symbol);
+  return reinterpret_cast<void *>(found_symbol);
 }
 
-#endif
+static void *dlopen(const char *filename, int flag) {
+  std::string file_name(filename);
+  file_name.replace(0, file_name.size() - 1, '/', '\\');
+  HMODULE hModule = LoadLibrary(file_name.c_str());
+  if (!hModule) {
+    throw std::runtime_error(file_name + " not found.");
+  }
+  return reinterpret_cast<void *>(hModule);
+}
+
+#endif  // !_WIN32
+
+static void ExecShellCommand(const std::string &cmd, std::string *message) {
+  char buffer[128];
+#if !defined(_WIN32)
+  std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
+#else
+  std::shared_ptr<FILE> pipe(_popen(cmd.c_str(), "r"), _pclose);
+#endif  // _WIN32
+  if (!pipe) {
+    LOG(ERROR) << "error running command: " << cmd;
+    return;
+  }
+  while (!feof(pipe.get())) {
+    if (fgets(buffer, 128, pipe.get()) != nullptr) {
+      *message += buffer;
+    }
+  }
+}
+
+static bool PathExists(const std::string &path) {
+#if !defined(_WIN32)
+  struct stat statbuf;
+  if (stat(path.c_str(), &statbuf) != -1) {
+    if (S_ISDIR(statbuf.st_mode)) {
+      return true;
+    }
+  }
+#else
+  struct _stat statbuf;
+  if (_stat(path.c_str(), &statbuf) != -1) {
+    if (S_ISDIR(statbuf.st_mode)) {
+      return true;
+    }
+  }
+#endif  // !_WIN32
+  return false;
+}
+
+// TODO(yuyang18): If the functions below are needed by other files, move them
+// to paddle::filesystem namespace.
+#if !defined(_WIN32)
+constexpr char kSEP = '/';
+#else
+constexpr char kSEP = '\\';
+#endif  // _WIN32
+
+static bool FileExists(const std::string &filepath) {
+#if !defined(_WIN32)
+  struct stat buffer;
+  return (stat(filepath.c_str(), &buffer) == 0);
+#else
+  struct _stat buffer;
+  return (_stat(filepath.c_str(), &buffer) == 0);
+#endif  // !_WIN32
+}
+
+static std::string DirName(const std::string &filepath) {
+  auto pos = filepath.rfind(kSEP);
+  if (pos == std::string::npos) {
+    return "";
+  }
+  return filepath.substr(0, pos);
+}
+
+static void MkDir(const char *path) {
+  std::string path_error(path);
+  path_error += " mkdir failed!";
+#if !defined(_WIN32)
+  if (mkdir(path, 0755)) {
+    if (errno != EEXIST) {
+      throw std::runtime_error(path_error);
+    }
+  }
+#else
+  CreateDirectory(path, NULL);
+  auto errorno = GetLastError();
+  if (errorno != ERROR_ALREADY_EXISTS) {
+    throw std::runtime_error(path_error);
+  }
+#endif  // !_WIN32
+}
+
+static void MkDirRecursively(const char *fullpath) {
+  if (*fullpath == '\0') return;  // empty string
+  if (FileExists(fullpath)) return;
+
+  MkDirRecursively(DirName(fullpath).c_str());
+  MkDir(fullpath);
+}
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 7199424b4709fbe9fc962cf98aea6223b9f3e51d..ad095b92711dccb44f26748bcfa89a0b4123c6e7 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -115,6 +115,7 @@ function cmake_gen() {
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
+        -DWITH_INFERENCE=${WITH_INFERENCE:-ON}
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
         -DPY_VERSION=${PY_VERSION:-2.7}
     ========================================
@@ -144,6 +145,7 @@ EOF
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
+        -DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
         -DPY_VERSION=${PY_VERSION:-2.7}
 }
@@ -498,7 +500,7 @@ EOF
 EOF
 
     if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.1.2-1+cuda${CUDA_MAJOR} libnccl-dev=2.1.2-1+cuda${CUDA_MAJOR} &&"
+        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&"
     else
         NCCL_DEPS=""
     fi
@@ -545,14 +547,14 @@ function gen_capi_package() {
         rm -rf $install_prefix
         make DESTDIR="$install_prefix" install
         cd $install_prefix/usr/local
-        ls | egrep -v "^Found.*item$" | xargs tar -cf ${PADDLE_ROOT}/build/paddle.tgz
+        ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz
     fi
 }
 
 function gen_fluid_inference_lib() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
-    if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
+    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
         cat <<EOF
     ========================================
     Deploying fluid inference library ...
@@ -567,7 +569,7 @@ EOF
 }
 
 function test_fluid_inference_lib() {
-    if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
+    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
         cat <<EOF
     ========================================
     Testing fluid inference library ...
diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py
index 920dbf3b4ebb0bc3d98c9ea986d7d039deed4a4c..19fc229e6fa84792f58aeeb00be09eb2401b19c7 100644
--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -104,7 +104,7 @@ def batch_images_from_tar(data_file,
                 pickle.dump(
                     output,
                     open('%s/batch_%d' % (out_path, file_id), 'wb'),
-                    protocol=pickle.HIGHEST_PROTOCOL)
+                    protocol=2)
                 file_id += 1
                 data = []
                 labels = []
@@ -113,9 +113,7 @@ def batch_images_from_tar(data_file,
         output['label'] = labels
         output['data'] = data
         pickle.dump(
-            output,
-            open('%s/batch_%d' % (out_path, file_id), 'wb'),
-            protocol=pickle.HIGHEST_PROTOCOL)
+            output, open('%s/batch_%d' % (out_path, file_id), 'wb'), protocol=2)
 
     with open(meta_file, 'a') as meta:
         for file in os.listdir(out_path):
diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py
index 3d2ef566173f81b29a6d8ea79cff00991a4ef3c4..a9b94a20720615dbfca97749463f27dbc88ac64f 100644
--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
@@ -98,10 +98,9 @@ class Inferencer(object):
             raise ValueError(
                 "inputs should be a map of {'input_name': input_var}")
 
-        with executor.scope_guard(self.scope):
-            results = self.exe.run(self.inference_program,
-                                   feed=inputs,
-                                   fetch_list=[self.predict_var],
+        with self._prog_and_scope_guard():
+            results = self.exe.run(feed=inputs,
+                                   fetch_list=[self.predict_var.name],
                                    return_numpy=return_numpy)
 
         return results
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 5757b2798e43dc70b406462a74b4f74eedcf56fa..1bc1dbbecaccd328d84cd3364a50c8f828d823c0 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -145,26 +145,23 @@ def rpn_target_assign(loc,
     """
 
     helper = LayerHelper('rpn_target_assign', **locals())
-    # 1. Compute the regression target bboxes
-    target_bbox = box_coder(
-        prior_box=anchor_box,
-        prior_box_var=anchor_var,
-        target_box=gt_box,
-        code_type='encode_center_size',
-        box_normalized=False)
-    # 2. Compute overlaps between the prior boxes and the gt boxes overlaps
+    # Compute overlaps between the prior boxes and the gt boxes overlaps
     iou = iou_similarity(x=gt_box, y=anchor_box)
-    # 3. Assign target label to anchors
-    loc_index = helper.create_tmp_variable(dtype=anchor_box.dtype)
-    score_index = helper.create_tmp_variable(dtype=anchor_box.dtype)
-    target_label = helper.create_tmp_variable(dtype=anchor_box.dtype)
+    # Assign target label to anchors
+    loc_index = helper.create_tmp_variable(dtype='int32')
+    score_index = helper.create_tmp_variable(dtype='int32')
+    target_label = helper.create_tmp_variable(dtype='int64')
+    target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype)
     helper.append_op(
         type="rpn_target_assign",
-        inputs={'DistMat': iou},
+        inputs={'Anchor': anchor_box,
+                'GtBox': gt_box,
+                'DistMat': iou},
         outputs={
             'LocationIndex': loc_index,
             'ScoreIndex': score_index,
-            'TargetLabel': target_label
+            'TargetLabel': target_label,
+            'TargetBBox': target_bbox,
         },
         attrs={
             'rpn_batch_size_per_im': rpn_batch_size_per_im,
@@ -173,16 +170,16 @@ def rpn_target_assign(loc,
             'fg_fraction': fg_fraction
         })
 
-    # 4. Reshape and gather the target entry
-    scores = nn.reshape(x=scores, shape=(-1, 2))
-    loc = nn.reshape(x=loc, shape=(-1, 4))
-    target_label = nn.reshape(x=target_label, shape=(-1, 1))
-    target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4))
+    loc_index.stop_gradient = True
+    score_index.stop_gradient = True
+    target_label.stop_gradient = True
+    target_bbox.stop_gradient = True
 
+    scores = nn.reshape(x=scores, shape=(-1, 1))
+    loc = nn.reshape(x=loc, shape=(-1, 4))
     predicted_scores = nn.gather(scores, score_index)
     predicted_location = nn.gather(loc, loc_index)
-    target_label = nn.gather(target_label, score_index)
-    target_bbox = nn.gather(target_bbox, loc_index)
+
     return predicted_scores, predicted_location, target_label, target_bbox
 
 
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
index 0182bbeb637ec7b6a341a4822a1cc5fb5aef077d..b1598bfec210474ae1e17f9f88e8b57aa80b8452 100644
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -78,7 +78,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
     return acc_out
 
 
-def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
+def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
     """
     **Area Under the Curve (AUC) Layer**
 
@@ -118,16 +118,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
     """
     helper = LayerHelper("auc", **locals())
     auc_out = helper.create_tmp_variable(dtype="float64")
+    batch_auc_out = helper.create_tmp_variable(dtype="float64")
     # make tp, tn, fp, fn persistable, so that can accumulate all batches.
-    tp = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds])
-    tn = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds])
-    fp = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds])
-    fn = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds])
-    for var in [tp, tn, fp, fn]:
+    stat_pos = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds + 1])
+    stat_neg = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds + 1])
+
+    for var in [stat_pos, stat_neg]:
         helper.set_variable_initializer(
             var, Constant(
                 value=0.0, force_cpu=True))
@@ -137,18 +135,15 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
         inputs={
             "Predict": [input],
             "Label": [label],
-            "TP": [tp],
-            "TN": [tn],
-            "FP": [fp],
-            "FN": [fn]
+            "StatPos": [stat_pos],
+            "StatNeg": [stat_neg]
         },
         attrs={"curve": curve,
                "num_thresholds": num_thresholds},
         outputs={
             "AUC": [auc_out],
-            "TPOut": [tp],
-            "TNOut": [tn],
-            "FPOut": [fp],
-            "FNOut": [fn]
+            "BatchAUC": [batch_auc_out],
+            "StatPosOut": [stat_pos],
+            "StatNegOut": [stat_neg]
         })
-    return auc_out, [tp, tn, fp, fn]
+    return auc_out, batch_auc_out, [stat_pos, stat_neg]
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 8ef7444a1a3b8ffd38c177ad943c008bd0aaf084..8408e6d2a12edacb310ed5eb543ad51585f3d82a 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -109,7 +109,9 @@ __all__ = [
     'flatten',
     'sequence_mask',
     'stack',
+    'pad2d',
     'unstack',
+    'sequence_enumerate',
 ]
 
 
@@ -3544,11 +3546,6 @@ def topk(input, k, name=None):
 
             top5_values, top5_indices = layers.topk(input, k=5)
     """
-    shape = input.shape
-    if k < 1 or k >= shape[-1]:
-        raise ValueError("k must be greater than 0 and less than %d." %
-                         (shape[-1]))
-
     helper = LayerHelper("top_k", **locals())
     values = helper.create_tmp_variable(dtype=input.dtype)
     indices = helper.create_tmp_variable(dtype="int64")
@@ -4028,10 +4025,12 @@ def transpose(x, perm, name=None):
 
     helper = LayerHelper('transpose', **locals())
     out = helper.create_tmp_variable(x.dtype)
+    x_shape = helper.create_tmp_variable(x.dtype)
     helper.append_op(
-        type='transpose',
+        type='transpose2',
         inputs={'X': [x]},
-        outputs={'Out': [out]},
+        outputs={'Out': [out],
+                 'XShape': [x_shape]},
         attrs={'axis': perm})
     return out
 
@@ -4501,7 +4500,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
     """
 
     if not (isinstance(shape, list) or isinstance(shape, tuple)):
-        raise ValueError("Input shape must be a python lsit or tuple.")
+        raise ValueError("Input shape must be a python list or tuple.")
     inputs = {"X": x}
     if isinstance(actual_shape, Variable):
         inputs["Shape"] = actual_shape
@@ -4523,13 +4522,15 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                 "Each dimension size given in shape must not be negtive "
                 "except one unknown dimension.")
 
-    helper = LayerHelper("reshape", **locals())
+    helper = LayerHelper("reshape2", **locals())
     out = helper.create_tmp_variable(dtype=x.dtype)
+    x_shape = helper.create_tmp_variable(dtype=x.dtype)
     helper.append_op(
-        type="reshape",
+        type="reshape2",
         inputs=inputs,
         attrs={"shape": shape},
-        outputs={"Out": out})
+        outputs={"Out": out,
+                 "XShape": x_shape})
 
     return helper.append_activation(out)
 
@@ -4573,11 +4574,13 @@ def squeeze(input, axes, name=None):
     """
     helper = LayerHelper("squeeze", **locals())
     out = helper.create_tmp_variable(dtype=input.dtype)
+    x_shape = helper.create_tmp_variable(dtype=input.dtype)
     helper.append_op(
-        type="squeeze",
+        type="squeeze2",
         inputs={"X": input},
         attrs={"axes": axes},
-        outputs={"Out": out})
+        outputs={"Out": out,
+                 "XShape": x_shape})
 
     return out
 
@@ -4608,11 +4611,13 @@ def unsqueeze(input, axes, name=None):
     """
     helper = LayerHelper("unsqueeze", **locals())
     out = helper.create_tmp_variable(dtype=input.dtype)
+    x_shape = helper.create_tmp_variable(dtype=input.dtype)
     helper.append_op(
-        type="unsqueeze",
+        type="unsqueeze2",
         inputs={"X": input},
         attrs={"axes": axes},
-        outputs={"Out": out})
+        outputs={"Out": out,
+                 "XShape": x_shape})
 
     return out
 
@@ -5614,6 +5619,94 @@ def rank_loss(label, left, right, name=None):
     return out
 
 
+def pad2d(input,
+          paddings=[0, 0, 0, 0],
+          mode='constant',
+          pad_value=0.0,
+          data_format="NCHW",
+          name=None):
+    """
+    Pad 2-d images accordding to 'paddings' and 'mode'.
+    If mode is 'reflect', paddings[0] and paddings[1] must be no greater
+    than height-1. And the width dimension has the same condition.
+
+    Example:
+
+      Given that X is a channel of image from input:
+      
+      X = [[1, 2, 3],
+           [4, 5, 6]]
+      
+      Case 0:
+      
+        paddings = [0, 1, 2, 3],
+        mode = 'constant'
+        pad_value = 0
+        
+        Out = [[0, 0, 1, 2, 3, 0, 0, 0]
+               [0, 0, 4, 5, 6, 0, 0, 0]
+               [0, 0, 0, 0, 0, 0, 0, 0]]
+      
+      Case 1:
+      
+        paddings = [0, 1, 2, 1],
+        mode = 'reflect'
+        
+        Out = [[3, 2, 1, 2, 3, 2]
+               [6, 5, 4, 5, 6, 5]
+               [3, 2, 1, 2, 3, 2]]
+        
+      Case 2:
+      
+        paddings = [0, 1, 2, 1],
+        mode = 'edge'
+        
+        Out = [[1, 1, 1, 2, 3, 3]
+               [4, 4, 4, 5, 6, 6]
+               [4, 4, 4, 5, 6, 6]]
+    
+  
+    Args:
+        input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format.
+        paddings (tuple|list): The padding size. If padding is a tuple, it must
+            contain four integers, (padding_top, padding_bottom, padding_left, padding_right).
+            Default: padding = [0, 0, 0, 0].
+        mode (str): Three modes: constant(default), reflect, edge. Default: constant
+        pad_value (float32): The value to fill the padded areas in constant mode. Default: 0
+        data_format (str): An optional string from: "NHWC", "NCHW". Specify the data format of
+                           the input data.
+                           Default: "NCHW"
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The tensor variable padded accordding to paddings and mode.
+
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          result = fluid.layers.pad2d(input=data, padding=[1,2,3,4], mode='reflect')
+    """
+
+    helper = LayerHelper('pad2d', **locals())
+    dtype = helper.input_dtype(input_param_name='input')
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='pad2d',
+        inputs={'X': input},
+        outputs={"Out": out},
+        attrs={
+            'paddings': paddings,
+            'mode': mode,
+            'pad_value': pad_value,
+            'data_frmat': data_format
+        })
+
+    return out
+
+
 def prelu(x, mode, param_attr=None, name=None):
     """
     Equation:
@@ -5628,8 +5721,8 @@ def prelu(x, mode, param_attr=None, name=None):
 		       all: all elements share same weight
  		       channel:elements in a channel share same weight
  		       element:each element has a weight
-	  name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
+	name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically. 
 
     Returns:
         Variable: The output tensor with the same shape as input.
@@ -5726,14 +5819,61 @@ def flatten(x, axis=1, name=None):
         raise ValueError("The axis should be a int, and in range [0, rank(x)]")
 
     out = helper.create_tmp_variable(x.dtype)
+    x_shape = helper.create_tmp_variable(x.dtype)
     helper.append_op(
-        type='flatten',
+        type='flatten2',
         inputs={"X": x},
-        outputs={'Out': out},
+        outputs={'Out': out,
+                 'XShape': x_shape},
         attrs={"axis": axis})
     return out
 
 
+def sequence_enumerate(input, win_size, pad_value=0, name=None):
+    """
+    Generate a new sequence for the input index sequence, which enumerates all the
+    sub-sequences with length `win_size` of the input. 
+    The enumerated sequence has the same 1st dimension with variable `input`, and
+    the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
+    
+    Examples:
+    Case 1:
+      Input:
+        X.lod = [[0, 3, 5]]
+        X.data = [[1], [2], [3], [4], [5]]
+        X.dims = [5, 1]
+      Attrs:
+        win_size = 2
+        pad_value = 0
+      Output:
+        Out.lod = [[0, 3, 5]]
+        Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
+        Out.dims = [5, 2]
+
+    Args:
+        input (Variable): The input variable which is a index sequence.
+        win_size (int): The window size for enumerating all sub-sequences.
+        pad_value (int): The padding value, default 0.
+
+    Returns:
+        Variable: The enumerate sequence variable which is a LoDTensor.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(shape[30, 1], dtype='int32', lod_level=1)
+            out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
+    """
+    helper = LayerHelper('sequence_enumerate', **locals())
+    out = helper.create_tmp_variable(helper.input_dtype(), stop_gradient=True)
+    helper.append_op(
+        type='sequence_enumerate',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={'win_size': win_size,
+               'pad_value': pad_value})
+
+
 def sequence_mask(x, maxlen=None, dtype='int64', name=None):
     """
     **SequenceMask Layer**
@@ -5813,6 +5953,7 @@ def stack(x, axis=0):
     helper.append_op(
         type='stack', inputs={'X': x}, outputs={'Y': out},
         attrs={'axis': axis})
+
     return out
 
 
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 592cb23eb9319658f8542ed5bc6ab3e95cfdb118..0c2800dcf35ed156b71625babea2724f520575e5 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -558,8 +558,6 @@ class Auc(MetricBase):
         name: metric name
         curve: Specifies the name of the curve to be computed, 'ROC' [default] or
           'PR' for the Precision-Recall-curve.
-        num_thresholds: The number of thresholds to use when discretizing the roc
-            curve.
 
     "NOTE: only implement the ROC curve type via Python now."
 
@@ -574,15 +572,14 @@ class Auc(MetricBase):
                 numpy_auc = metric.eval()
     """
 
-    def __init__(self, name, curve='ROC', num_thresholds=200):
+    def __init__(self, name, curve='ROC', num_thresholds=4095):
         super(Auc, self).__init__(name=name)
         self._curve = curve
         self._num_thresholds = num_thresholds
-        self._epsilon = 1e-6
-        self.tp_list = np.zeros((num_thresholds, ))
-        self.fn_list = np.zeros((num_thresholds, ))
-        self.tn_list = np.zeros((num_thresholds, ))
-        self.fp_list = np.zeros((num_thresholds, ))
+
+        _num_pred_buckets = num_thresholds + 1
+        self._stat_pos = [0] * _num_pred_buckets
+        self._stat_neg = [0] * _num_pred_buckets
 
     def update(self, preds, labels):
         if not _is_numpy_(labels):
@@ -590,41 +587,32 @@ class Auc(MetricBase):
         if not _is_numpy_(preds):
             raise ValueError("The 'predictions' must be a numpy ndarray.")
 
-        kepsilon = 1e-7  # to account for floating point imprecisions
-        thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1)
-                      for i in range(self._num_thresholds - 2)]
-        thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
-
-        # calculate TP, FN, TN, FP count
-        for idx_thresh, thresh in enumerate(thresholds):
-            tp, fn, tn, fp = 0, 0, 0, 0
-            for i, lbl in enumerate(labels):
-                if lbl:
-                    if preds[i, 1] >= thresh:
-                        tp += 1
-                    else:
-                        fn += 1
-                else:
-                    if preds[i, 1] >= thresh:
-                        fp += 1
-                    else:
-                        tn += 1
-            self.tp_list[idx_thresh] += tp
-            self.fn_list[idx_thresh] += fn
-            self.tn_list[idx_thresh] += tn
-            self.fp_list[idx_thresh] += fp
+        for i, lbl in enumerate(labels):
+            value = preds[i, 1]
+            bin_idx = int(value * self._num_thresholds)
+            assert bin_idx <= self._num_thresholds
+            if lbl:
+                self._stat_pos[bin_idx] += 1.0
+            else:
+                self._stat_neg[bin_idx] += 1.0
+
+    @staticmethod
+    def trapezoid_area(x1, x2, y1, y2):
+        return abs(x1 - x2) * (y1 + y2) / 2.0
 
     def eval(self):
-        epsilon = self._epsilon
-        num_thresholds = self._num_thresholds
-        tpr = (self.tp_list.astype("float32") + epsilon) / (
-            self.tp_list + self.fn_list + epsilon)
-        fpr = self.fp_list.astype("float32") / (
-            self.fp_list + self.tn_list + epsilon)
-        rec = (self.tp_list.astype("float32") + epsilon) / (
-            self.tp_list + self.fp_list + epsilon)
-
-        x = fpr[:num_thresholds - 1] - fpr[1:]
-        y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
-        auc_value = np.sum(x * y)
-        return auc_value
+        tot_pos = 0.0
+        tot_neg = 0.0
+        auc = 0.0
+
+        idx = self._num_thresholds
+        while idx >= 0:
+            tot_pos_prev = tot_pos
+            tot_neg_prev = tot_neg
+            tot_pos += self._stat_pos[idx]
+            tot_neg += self._stat_neg[idx]
+            auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos,
+                                       tot_pos_prev)
+            idx -= 1
+
+        return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 33d6311b9717c66f0d6782eb6b3e348cd4c02a69..215f0cf2fc5ab4fbd06719ac4790a01dd00080eb 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -897,7 +897,20 @@ class RMSPropOptimizer(Optimizer):
 
         r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
 
-        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) +
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w & = w - v(w, t)
+
+    if centered is True:
+
+    ..  math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
+
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
             \\epsilon}} \\nabla Q_{i}(w)
 
         w & = w - v(w, t)
@@ -915,6 +928,10 @@ class RMSPropOptimizer(Optimizer):
             avoid division by zero, set 1e-6 by default.
         momentum(float): :math:`\\beta` in equation is the momentum term,
             set 0.0 by default.
+        centered(bool): If True, gradients are normalized by the estimated variance of
+            the gradient; if False, by the uncentered second moment. Setting this to
+            True may help with training, but is slightly more expensive in terms of
+            computation and memory. Defaults to False.
 
     Raises:
         ValueError: If learning_rate, rho, epsilon, momentum are None.
@@ -928,12 +945,14 @@ class RMSPropOptimizer(Optimizer):
 
     _momentum_acc_str = "momentum"
     _mean_square_acc_str = "mean_square"
+    _mean_grad_acc_str = "mean_grad"
 
     def __init__(self,
                  learning_rate,
                  rho=0.95,
                  epsilon=1.0e-6,
                  momentum=0.0,
+                 centered=False,
                  **kwargs):
         super(RMSPropOptimizer, self).__init__(
             learning_rate=learning_rate, **kwargs)
@@ -950,6 +969,7 @@ class RMSPropOptimizer(Optimizer):
         self._rho = rho
         self._epsilon = epsilon
         self._momentum = momentum
+        self._centered = centered
 
     def _create_accumulators(self, block, parameters):
         if not isinstance(block, framework.Block):
@@ -958,6 +978,7 @@ class RMSPropOptimizer(Optimizer):
         for p in parameters:
             self._add_accumulator(self._momentum_acc_str, p)
             self._add_accumulator(self._mean_square_acc_str, p)
+            self._add_accumulator(self._mean_grad_acc_str, p)
 
     def _append_optimize_op(self, block, param_and_grad):
         if not isinstance(block, framework.Block):
@@ -967,6 +988,8 @@ class RMSPropOptimizer(Optimizer):
                                              param_and_grad[0])
         mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
                                                 param_and_grad[0])
+        mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
+                                              param_and_grad[0])
         rmsprop_op = block.append_op(
             type=self.type,
             inputs={
@@ -974,17 +997,20 @@ class RMSPropOptimizer(Optimizer):
                 "Grad": param_and_grad[1],
                 "Moment": momentum_acc,
                 "MeanSquare": mean_square_acc,
+                "MeanGrad": mean_grad_acc,
                 "LearningRate": self._create_param_lr(param_and_grad),
             },
             outputs={
                 "ParamOut": param_and_grad[0],
                 "MomentOut": momentum_acc,
-                "MeanSquareOut": mean_square_acc
+                "MeanSquareOut": mean_square_acc,
+                "MeanGradOut": mean_grad_acc
             },
             attrs={
                 "epsilon": self._epsilon,
                 "decay": self._rho,
-                "momentum": self._momentum
+                "momentum": self._momentum,
+                "centered": self._centered
             })
 
         return rmsprop_op
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index a7765c9591f0bd653c08036c46a36131906a758f..4790e0f6119e96b11b049bfdd3b46d40a382683b 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -43,8 +43,9 @@ class ParallelExecutor(object):
         num_trainers(int): If greater than 1, NCCL will be initialized with
             multiple rank of nodes, each node should have same number of GPUs.
             Distributed training will be enabled then. Default 1.
-        trainer_id(int: Must use together with num_trainers. trainer_id is the
+        trainer_id(int): Must use together with num_trainers. trainer_id is the
             "rank" of current node starts from 0. Default 0.
+        scope(Scope): scope to run with, default use fluid.global_scope().
 
     Returns:
         ParallelExecutor: The initialized ParallelExecutor object.
@@ -73,6 +74,7 @@ class ParallelExecutor(object):
                  build_strategy=None,
                  num_trainers=1,
                  trainer_id=0,
+                 scope=None,
                  **kwargs):
         if len(kwargs) != 0:
             err_msg = ""
@@ -131,7 +133,8 @@ class ParallelExecutor(object):
 
         main = main_program
         main = main if main else framework.default_main_program()
-        scope = executor.global_scope()
+        if scope == None:
+            scope = executor.global_scope()
         # FIXME(Yancey1989): it's a temporary approach to determinate the distribute
         # train program, call self.bcast_param() at the end of each mini-batch.
         self.is_dist = True if "recv" in [
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
index f6017a455df7e8bd197ef2563a759f843b5e7c73..e1368a3392a9cab3e82eff0a73eb225a52aa03bf 100644
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -47,14 +47,14 @@ def train_program():
     loss = fluid.layers.square_error_cost(input=y_predict, label=y)
     avg_loss = fluid.layers.mean(loss)
 
-    return avg_loss
+    return [avg_loss, y_predict]
 
 
 def optimizer_func():
     return fluid.optimizer.SGD(learning_rate=0.001)
 
 
-def train(use_cuda, train_program, params_dirname):
+def train(use_cuda, train_program, params_dirname, inference_model_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     trainer = fluid.Trainer(
@@ -74,6 +74,8 @@ def train(use_cuda, train_program, params_dirname):
                 '''
                 if params_dirname is not None:
                     trainer.save_params(params_dirname)
+                    trainer.save_inference_model(inference_model_dirname,
+                                                 ['x'], [1])
                 trainer.stop()
 
     trainer.train(
@@ -99,15 +101,55 @@ def infer(use_cuda, inference_program, params_dirname=None):
     print("infer results: ", results[0])
 
 
+def infer_by_saved_model(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        # Use fluid.io.load_inference_model to obtain the inference program desc,
+        # the feed_target_names (the names of variables that will be feeded
+        # data using feed operators), and the fetch_targets (variables that
+        # we want to obtain data from using fetch operators).
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+        # The input's dimension should be 2-D and the second dim is 13
+        # The input data should be >= 0
+        batch_size = 10
+
+        test_reader = paddle.batch(
+            paddle.dataset.uci_housing.test(), batch_size=batch_size)
+
+        test_data = next(test_reader())
+        test_feat = numpy.array(
+            [data[0] for data in test_data]).astype("float32")
+        test_label = numpy.array(
+            [data[1] for data in test_data]).astype("float32")
+
+        assert feed_target_names[0] == 'x'
+        results = exe.run(inference_program,
+                          feed={feed_target_names[0]: numpy.array(test_feat)},
+                          fetch_list=fetch_targets)
+        print("infer shape: ", results[0].shape)
+        print("infer results: ", results[0])
+        print("ground truth: ", test_label)
+
+
 def main(use_cuda):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
 
     # Directory for saving the trained model
-    params_dirname = "fit_a_line.inference.model"
+    params_dirname = "fit_a_line.model"
+    inference_model_dirname = "fit_a_line.inference_model"
 
-    train(use_cuda, train_program, params_dirname)
+    train(use_cuda, train_program, params_dirname, inference_model_dirname)
     infer(use_cuda, inference_program, params_dirname)
+    infer_by_saved_model(use_cuda, inference_model_dirname)
 
 
 class TestFitALine(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
index be494a0d340c62fb35afbf97fba38eff08a965e6..e5ae95e2d943917b9bc10f0d4c4bdc5f8fb07fdb 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -16,7 +16,10 @@ from __future__ import print_function
 
 import paddle
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import numpy
+import six
+import os
 import cifar10_small_test_set
 
 
@@ -89,7 +92,7 @@ def optimizer_func():
     return fluid.optimizer.Adam(learning_rate=0.001)
 
 
-def train(use_cuda, train_program, params_dirname):
+def train(use_cuda, train_program, parallel, params_dirname):
     BATCH_SIZE = 128
     EPOCH_NUM = 1
 
@@ -116,7 +119,10 @@ def train(use_cuda, train_program, params_dirname):
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     trainer = fluid.Trainer(
-        train_func=train_program, optimizer_func=optimizer_func, place=place)
+        train_func=train_program,
+        optimizer_func=optimizer_func,
+        place=place,
+        parallel=parallel)
 
     trainer.train(
         reader=train_reader,
@@ -125,10 +131,13 @@ def train(use_cuda, train_program, params_dirname):
         feed_order=['pixel', 'label'])
 
 
-def infer(use_cuda, inference_program, params_dirname=None):
+def infer(use_cuda, inference_program, parallel, params_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=params_dirname, place=place)
+        infer_func=inference_program,
+        param_path=params_dirname,
+        place=place,
+        parallel=parallel)
 
     # The input's dimension of conv should be 4-D or 5-D.
     # Use normilized image pixels as input data, which should be in the range
@@ -139,22 +148,37 @@ def infer(use_cuda, inference_program, params_dirname=None):
     print("infer results: ", results)
 
 
-def main(use_cuda):
+def main(use_cuda, parallel):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
     save_path = "image_classification_resnet.inference.model"
 
+    os.environ['CPU_NUM'] = str(4)
     train(
         use_cuda=use_cuda,
         train_program=train_network,
-        params_dirname=save_path)
+        params_dirname=save_path,
+        parallel=parallel)
 
+    # FIXME(zcd): in the inference stage, the number of
+    # input data is one, it is not appropriate to use parallel.
+    if parallel and use_cuda:
+        return
+
+    os.environ['CPU_NUM'] = str(1)
     infer(
         use_cuda=use_cuda,
         inference_program=inference_network,
-        params_dirname=save_path)
+        params_dirname=save_path,
+        parallel=parallel)
 
 
 if __name__ == '__main__':
     for use_cuda in (False, True):
-        main(use_cuda=use_cuda)
+        for parallel in (False, True):
+            if use_cuda and not core.is_compiled_with_cuda():
+                continue
+            # TODO(minqiyang): remove this line after fixing the deletion
+            # order problem of Scope in ParallelExecutor in manylinux
+            if six.PY2:
+                main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
index dbc7bc06c93157f271c79e85b6925468e861e57f..ff91be72c918f8dac65b7030e45c4a00deb965ac 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -16,7 +16,10 @@ from __future__ import print_function
 
 import paddle
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import numpy
+import six
+import os
 import cifar10_small_test_set
 
 
@@ -68,7 +71,7 @@ def optimizer_func():
     return fluid.optimizer.Adam(learning_rate=0.001)
 
 
-def train(use_cuda, train_program, params_dirname):
+def train(use_cuda, train_program, parallel, params_dirname):
     BATCH_SIZE = 128
     train_reader = paddle.batch(
         paddle.reader.shuffle(
@@ -93,7 +96,10 @@ def train(use_cuda, train_program, params_dirname):
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     trainer = fluid.Trainer(
-        train_func=train_program, place=place, optimizer_func=optimizer_func)
+        train_func=train_program,
+        place=place,
+        optimizer_func=optimizer_func,
+        parallel=parallel)
 
     trainer.train(
         reader=train_reader,
@@ -102,10 +108,13 @@ def train(use_cuda, train_program, params_dirname):
         feed_order=['pixel', 'label'])
 
 
-def infer(use_cuda, inference_program, params_dirname=None):
+def infer(use_cuda, inference_program, parallel, params_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=params_dirname, place=place)
+        infer_func=inference_program,
+        param_path=params_dirname,
+        place=place,
+        parallel=parallel)
 
     # The input's dimension of conv should be 4-D or 5-D.
     # Use normilized image pixels as input data, which should be in the range
@@ -116,22 +125,34 @@ def infer(use_cuda, inference_program, params_dirname=None):
     print("infer results: ", results)
 
 
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
+def main(use_cuda, parallel):
     save_path = "image_classification_vgg.inference.model"
 
+    os.environ['CPU_NUM'] = str(4)
     train(
         use_cuda=use_cuda,
         train_program=train_network,
-        params_dirname=save_path)
+        params_dirname=save_path,
+        parallel=parallel)
 
+    # FIXME(zcd): in the inference stage, the number of
+    # input data is one, it is not appropriate to use parallel.
+    if parallel and use_cuda:
+        return
+    os.environ['CPU_NUM'] = str(1)
     infer(
         use_cuda=use_cuda,
         inference_program=inference_network,
-        params_dirname=save_path)
+        params_dirname=save_path,
+        parallel=parallel)
 
 
 if __name__ == '__main__':
     for use_cuda in (False, True):
-        main(use_cuda=use_cuda)
+        for parallel in (False, True):
+            if use_cuda and not core.is_compiled_with_cuda():
+                continue
+            # TODO(minqiyang): remove this line after fixing the deletion
+            # order problem of Scope in ParallelExecutor in manylinux
+            if six.PY2:
+                main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
index 187bef1b0c1a614fbca88ef22097831d7bd5cd7f..fa72c939e57356f26d60032dd0a91c894b28c505 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -18,6 +18,7 @@ import argparse
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle
+import six
 import sys
 import numpy
 import unittest
@@ -64,14 +65,14 @@ def optimizer_func():
     return fluid.optimizer.Adam(learning_rate=0.001)
 
 
-def train(use_cuda, train_program, params_dirname):
+def train(use_cuda, train_program, parallel, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     trainer = fluid.Trainer(
         train_func=train_program,
         place=place,
         optimizer_func=optimizer_func,
-        parallel=True)
+        parallel=parallel)
 
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
@@ -108,11 +109,14 @@ def train(use_cuda, train_program, params_dirname):
         feed_order=['img', 'label'])
 
 
-def infer(use_cuda, inference_program, params_dirname=None):
+def infer(use_cuda, inference_program, parallel, params_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=params_dirname, place=place)
+        infer_func=inference_program,
+        param_path=params_dirname,
+        place=place,
+        parallel=parallel)
 
     batch_size = 1
     tensor_img = numpy.random.uniform(-1.0, 1.0,
@@ -123,20 +127,35 @@ def infer(use_cuda, inference_program, params_dirname=None):
     print("infer results: ", results[0])
 
 
-def main(use_cuda):
+def main(use_cuda, parallel):
     params_dirname = "recognize_digits_conv.inference.model"
 
     # call train() with is_local argument to run distributed train
+    os.environ['CPU_NUM'] = str(4)
     train(
         use_cuda=use_cuda,
         train_program=train_program,
-        params_dirname=params_dirname)
+        params_dirname=params_dirname,
+        parallel=parallel)
+
+    # FIXME(zcd): in the inference stage, the number of
+    # input data is one, it is not appropriate to use parallel.
+    if parallel and use_cuda:
+        return
+    os.environ['CPU_NUM'] = str(1)
     infer(
         use_cuda=use_cuda,
         inference_program=inference_program,
-        params_dirname=params_dirname)
+        params_dirname=params_dirname,
+        parallel=parallel)
 
 
 if __name__ == '__main__':
-    # for use_cuda in (False, True):
-    main(use_cuda=core.is_compiled_with_cuda())
+    for use_cuda in (False, True):
+        for parallel in (False, True):
+            if use_cuda and not core.is_compiled_with_cuda():
+                continue
+            # TODO(minqiyang): remove this line after fixing the deletion
+            # order problem of Scope in ParallelExecutor in manylinux
+            if six.PY2:
+                main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
index b95e7db122adbb1414da1691926c920b963fd6fe..440d2a30835cb89089709f024a4dcc6e4113efa8 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
@@ -16,7 +16,9 @@ from __future__ import print_function
 
 import argparse
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import paddle
+import six
 import sys
 import numpy
 import unittest
@@ -50,11 +52,14 @@ def optimizer_func():
     return fluid.optimizer.Adam(learning_rate=0.001)
 
 
-def train(use_cuda, train_program, params_dirname):
+def train(use_cuda, train_program, params_dirname, parallel):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     trainer = fluid.Trainer(
-        train_func=train_program, place=place, optimizer_func=optimizer_func)
+        train_func=train_program,
+        place=place,
+        optimizer_func=optimizer_func,
+        parallel=parallel)
 
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
@@ -86,11 +91,14 @@ def train(use_cuda, train_program, params_dirname):
         feed_order=['img', 'label'])
 
 
-def infer(use_cuda, inference_program, params_dirname=None):
+def infer(use_cuda, inference_program, parallel, params_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=params_dirname, place=place)
+        infer_func=inference_program,
+        param_path=params_dirname,
+        place=place,
+        parallel=parallel)
 
     batch_size = 1
     tensor_img = numpy.random.uniform(-1.0, 1.0,
@@ -101,20 +109,35 @@ def infer(use_cuda, inference_program, params_dirname=None):
     print("infer results: ", results[0])
 
 
-def main(use_cuda):
+def main(use_cuda, parallel):
     params_dirname = "recognize_digits_mlp.inference.model"
 
     # call train() with is_local argument to run distributed train
+    os.environ['CPU_NUM'] = str(4)
     train(
         use_cuda=use_cuda,
         train_program=train_program,
-        params_dirname=params_dirname)
+        params_dirname=params_dirname,
+        parallel=parallel)
+
+    # FIXME(zcd): in the inference stage, the number of
+    # input data is one, it is not appropriate to use parallel.
+    if parallel and use_cuda:
+        return
+    os.environ['CPU_NUM'] = str(1)
     infer(
         use_cuda=use_cuda,
         inference_program=inference_program,
-        params_dirname=params_dirname)
+        params_dirname=params_dirname,
+        parallel=parallel)
 
 
 if __name__ == '__main__':
-    # for use_cuda in (False, True):
-    main(use_cuda=False)
+    for use_cuda in (False, True):
+        for parallel in (False, True):
+            if use_cuda and not core.is_compiled_with_cuda():
+                continue
+            # TODO(minqiyang): remove this line after fixing the deletion
+            # order problem of Scope in ParallelExecutor in manylinux
+            if six.PY2:
+                main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index ec0bf3ff8d64345111537780aaa5367ed0e1f8ff..e2564763d19d180f7c6933429dddf58c77be7bb8 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -281,7 +281,7 @@ class TestRpnTargetAssign(unittest.TestCase):
             gt_box = layers.data(
                 name='gt_box', shape=[4], lod_level=1, dtype='float32')
 
-            predicted_scores, predicted_location, target_label, target_bbox = layers.rpn_target_assign(
+            pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign(
                 loc=loc,
                 scores=scores,
                 anchor_box=anchor_box,
@@ -292,15 +292,13 @@ class TestRpnTargetAssign(unittest.TestCase):
                 rpn_positive_overlap=0.7,
                 rpn_negative_overlap=0.3)
 
-            self.assertIsNotNone(predicted_scores)
-            self.assertIsNotNone(predicted_location)
-            self.assertIsNotNone(target_label)
-            self.assertIsNotNone(target_bbox)
-            assert predicted_scores.shape[1] == 2
-            assert predicted_location.shape[1] == 4
-            assert predicted_location.shape[1] == target_bbox.shape[1]
-
-        print(str(program))
+            self.assertIsNotNone(pred_scores)
+            self.assertIsNotNone(pred_loc)
+            self.assertIsNotNone(tgt_lbl)
+            self.assertIsNotNone(tgt_bbox)
+            assert pred_scores.shape[1] == 1
+            assert pred_loc.shape[1] == 4
+            assert pred_loc.shape[1] == tgt_bbox.shape[1]
 
 
 class TestGenerateProposals(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
index 7abfa0a4be0dec9fe251704e22dfef1f932e7c5b..e3db316698398ff693157d583ad1410d10dcf81d 100644
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -36,6 +36,7 @@ import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid import core
 from test_dist_base import TestDistRunnerBase, runtime_main
+import paddle.compat as cpt
 from paddle.compat import long_type
 
 import hashlib
@@ -315,8 +316,9 @@ def pad_batch_data(insts,
     """
     return_list = []
     max_len = max(len(inst) for inst in insts)
-    num_token = reduce(lambda x, y: x + y,
-                       [len(inst) for inst in insts]) if return_num_token else 0
+    num_token = six.moves.reduce(
+        lambda x, y: x + y,
+        [len(inst) for inst in insts]) if return_num_token else 0
     # Any token included in dict can be used to pad, since the paddings' loss
     # will be masked out by weights and make no effect on parameter gradients.
     inst_data = np.array(
@@ -328,7 +330,7 @@ def pad_batch_data(insts,
         return_list += [inst_weight.astype("float32").reshape([-1, 1])]
     else:  # position data
         inst_pos = np.array([
-            range(1, len(inst) + 1) + [0] * (max_len - len(inst))
+            list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
             for inst in insts
         ])
         return_list += [inst_pos.astype("int64").reshape([-1, 1])]
@@ -385,10 +387,11 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx,
         return_num_token=True)
 
     data_input_dict = dict(
-        zip(data_input_names, [
-            src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
-            trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
-        ]))
+        list(
+            zip(data_input_names, [
+                src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
+                trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+            ])))
     return data_input_dict, np.asarray([num_token], dtype="float32")
 
 
@@ -561,7 +564,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
                         np.log(TrainTaskConfig.label_smooth_eps / (
                             ModelHyperParams.trg_vocab_size - 1) + 1e-20))
     init = False
-    for pass_id in xrange(TrainTaskConfig.pass_num):
+    for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
         pass_start_time = time.time()
         for batch_id, data in enumerate(train_data()):
             if batch_id >= 5:
@@ -587,11 +590,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
                     ModelHyperParams.eos_idx, ModelHyperParams.n_head,
                     ModelHyperParams.d_model)
                 total_num_token += num_token
-                feed_kv_pairs = data_input_dict.items()
+                feed_kv_pairs = list(data_input_dict.items())
                 if TrainTaskConfig.local:
-                    feed_kv_pairs += {
+                    feed_kv_pairs += list({
                         lr_scheduler.learning_rate.name: lr_rate
-                    }.items()
+                    }.items())
                 feed_list.append(dict(feed_kv_pairs))
 
                 if not init:
@@ -873,6 +876,7 @@ class DataReader(object):
 
             f = tarfile.open(fpaths[0], "r")
             for line in f.extractfile(tar_fname):
+                line = cpt.to_text(line)
                 fields = line.strip("\n").split(self._field_delimiter)
                 if (not self._only_src and len(fields) == 2) or (
                         self._only_src and len(fields) == 1):
@@ -882,8 +886,9 @@ class DataReader(object):
                 if not os.path.isfile(fpath):
                     raise IOError("Invalid file: %s" % fpath)
 
-                with open(fpath, "r") as f:
+                with open(fpath, "rb") as f:
                     for line in f:
+                        line = cpt.to_text(line)
                         fields = line.strip("\n").split(self._field_delimiter)
                         if (not self._only_src and len(fields) == 2) or (
                                 self._only_src and len(fields) == 1):
@@ -892,8 +897,9 @@ class DataReader(object):
     @staticmethod
     def load_dict(dict_path, reverse=False):
         word_dict = {}
-        with open(dict_path, "r") as fdict:
+        with open(dict_path, "rb") as fdict:
             for idx, line in enumerate(fdict):
+                line = cpt.to_text(line)
                 if reverse:
                     word_dict[idx] = line.strip("\n")
                 else:
@@ -1034,7 +1040,7 @@ def multi_head_attention(queries,
         # size of the input as the output dimension size.
         return layers.reshape(
             x=trans_x,
-            shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]]))
+            shape=list(map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])))
 
     def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
         """
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 20f1a37a426e9697048d636bf738c9056213e5f6..56a242b996f67aa4b9c858ab8aaeb1c1cd3bcf60 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -249,7 +249,7 @@ class OpTest(unittest.TestCase):
         outs, _ = self._calc_output(place)
         return outs
 
-    def _calc_output(self, place, parallel=False):
+    def _calc_output(self, place, parallel=False, no_check_set=None):
 
         program = Program()
         block = program.global_block()
@@ -273,6 +273,8 @@ class OpTest(unittest.TestCase):
         # if not, fill the fetch_list by the user configured outputs in test.
         if len(fetch_list) == 0:
             for var_name, var in six.iteritems(outputs):
+                if no_check_set is not None and var_name in no_check_set:
+                    continue
                 if isinstance(var, list):
                     for v in var:
                         fetch_list.append(v)
@@ -291,11 +293,17 @@ class OpTest(unittest.TestCase):
                             return_numpy=False)
         return outs, fetch_list
 
-    def check_output_with_place(self, place, atol):
-        outs, fetch_list = self._calc_output(place)
+    def check_output_with_place(self,
+                                place,
+                                atol,
+                                no_check_set=None,
+                                equal_nan=False):
+        outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
         for out_name, out_dup in Operator.get_op_outputs(self.op_type):
             if out_name not in self.outputs:
                 continue
+            if no_check_set is not None and out_name in no_check_set:
+                continue
 
             def find_actual(target_name, fetch_list):
                 found = [
@@ -321,7 +329,7 @@ class OpTest(unittest.TestCase):
                         if isinstance(expect, tuple) else expect
                     self.assertTrue(
                         np.allclose(
-                            actual_t, expect_t, atol=atol),
+                            actual_t, expect_t, atol=atol, equal_nan=equal_nan),
                         "Output (" + sub_out_name + ") has diff at " +
                         str(place))
                     if isinstance(expect, tuple):
@@ -337,7 +345,7 @@ class OpTest(unittest.TestCase):
                 expect_t = expect[0] if isinstance(expect, tuple) else expect
                 self.assertTrue(
                     np.allclose(
-                        actual_t, expect_t, atol=atol),
+                        actual_t, expect_t, atol=atol, equal_nan=equal_nan),
                     "Output (" + out_name + ") has diff at " + str(place) +
                     "\nExpect " + str(expect_t) + "\n" + "But Got" +
                     str(actual_t))
@@ -360,10 +368,10 @@ class OpTest(unittest.TestCase):
             places.append(core.CUDAPlace(0))
         return places
 
-    def check_output(self, atol=1e-5):
+    def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False):
         places = self._get_places()
         for place in places:
-            self.check_output_with_place(place, atol)
+            self.check_output_with_place(place, atol, no_check_set, equal_nan)
 
     def check_output_customized(self, checker):
         places = self._get_places()
diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py
index 5393a17e674a3cad6d705a1ff7a45320e644af94..1de4a9d016a177944253d12094722d3a05614be2 100644
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
@@ -26,18 +26,15 @@ class TestAucOp(OpTest):
         pred = np.random.random((128, 2)).astype("float32")
         labels = np.random.randint(0, 2, (128, 1))
         num_thresholds = 200
-        tp = np.zeros((num_thresholds, )).astype("int64")
-        tn = np.zeros((num_thresholds, )).astype("int64")
-        fp = np.zeros((num_thresholds, )).astype("int64")
-        fn = np.zeros((num_thresholds, )).astype("int64")
+
+        stat_pos = np.zeros((num_thresholds + 1, )).astype("int64")
+        stat_neg = np.zeros((num_thresholds + 1, )).astype("int64")
 
         self.inputs = {
             'Predict': pred,
             'Label': labels,
-            'TP': tp,
-            'TN': tn,
-            'FP': fp,
-            'FN': fn
+            "StatPos": stat_pos,
+            "StatNeg": stat_neg
         }
         self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
 
@@ -47,11 +44,10 @@ class TestAucOp(OpTest):
         python_auc.update(pred, labels)
 
         self.outputs = {
-            'AUC': python_auc.eval(),
-            'TPOut': python_auc.tp_list,
-            'FNOut': python_auc.fn_list,
-            'TNOut': python_auc.tn_list,
-            'FPOut': python_auc.fp_list
+            'AUC': np.array(python_auc.eval()),
+            'BatchAUC': np.array(python_auc.eval()),
+            'StatPosOut': np.array(python_auc._stat_pos),
+            'StatNegOut': np.array(python_auc._stat_neg)
         }
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 58875a1dd19fd91f6f2bed928397ee7f73302dff..c0f5da5a1ae43847dff6348ea5f3e3bfd5e89ab9 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -55,6 +55,7 @@ class TestDistRunnerBase(object):
         pserver_prog = t.get_pserver_program(args.current_endpoint)
         startup_prog = t.get_startup_program(args.current_endpoint,
                                              pserver_prog)
+
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup_prog)
@@ -147,6 +148,8 @@ def runtime_main(test_class):
 
 
 import paddle.compat as cpt
+import socket
+from contextlib import closing
 
 
 class TestDistBase(unittest.TestCase):
@@ -156,13 +159,19 @@ class TestDistBase(unittest.TestCase):
     def setUp(self):
         self._trainers = 2
         self._pservers = 2
-        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
+        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+            self._find_free_port(), self._find_free_port())
         self._python_interp = "python"
         self._sync_mode = True
         self._mem_opt = False
         self._use_reduce = False
         self._setup_config()
 
+    def _find_free_port(self):
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+            s.bind(('', 0))
+            return s.getsockname()[1]
+
     def start_pserver(self, model_file, check_error_log):
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
         ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 1d9ab44ed447468fb8383c52747d14970ae27ced..b85501ef6b80d1f5004aa0dd08c3123d3bda48a5 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -438,7 +438,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
         # 2 optimize for table adam
         # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
-                         ["sum", "adam", "scale", "scale"])
+                         ["sum", "scale", "adam", "scale", "scale"])
 
         trainer, _ = self.get_trainer()
         self.assertEqual(len(trainer.blocks), 1)
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index cc0494774a5f2f24faaae65f193fc3ff9270d9ac..820ad4af88e9dc49cbe57ac182e1ba0402725f3d 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -21,28 +21,41 @@ from op_test import OpTest
 
 class TestFakeQuantizeOp(OpTest):
     def setUp(self):
-        self.op_type = "fake_quantize"
+        self.op_type = "fake_quantize_abs_max"
+        self.attrs = {'bit_length': 8}
+        self.inputs = {'X': np.random.random((124, 240)).astype("float32"), }
+        scale = np.max(np.abs(self.inputs['X'])).astype("float32")
+        self.outputs = {
+            'Out': np.round(self.inputs['X'] / scale * (
+                (1 << (self.attrs['bit_length'] - 1)) - 1)),
+            'OutScale': np.array(scale).astype("float32"),
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFakeQuantizeOp(OpTest):
+    def setUp(self):
+        self.op_type = "fake_quantize_range_abs_max"
         self.attrs = {
-            'bit_length': 8,
-            'quantize_type': 'abs_max',
-            'window_size': 10000
+            'bit_length': int(5),
+            'window_size': int(1),
+            'is_test': False
         }
         self.inputs = {
-            'X': np.random.random((10, 10)).astype("float32"),
-            'InScales': np.zeros(self.attrs['window_size']).astype("float32"),
-            'InCurrentIter': np.zeros(1).astype("float32"),
-            'InMovingScale': np.zeros(1).astype("float32")
-        }
-        self.scale = {
-            'abs_max': np.max(np.abs(self.inputs['X'])).astype("float32")
+            'X': np.random.random((8, 16, 7, 7)).astype("float32"),
+            'Iter': np.zeros(1).astype("int64"),
+            'InScale': np.zeros(1).astype("float32")
         }
+        scale = np.max(np.abs(self.inputs['X'])).astype("float32")
+        out_scales = np.zeros(self.attrs['window_size']).astype("float32")
+        out_scales[0] = scale
         self.outputs = {
-            'Out': np.round(self.inputs['X'] / self.scale['abs_max'] * (
+            'Out': np.round(self.inputs['X'] / scale * (
                 (1 << (self.attrs['bit_length'] - 1)) - 1)),
-            'OutScales': np.zeros(self.attrs['window_size']).astype("float32"),
-            'OutMovingScale':
-            np.array([self.scale['abs_max']]).astype("float32"),
-            'OutCurrentIter': np.zeros(1).astype("float32")
+            'OutScale': scale,
+            'OutScales': out_scales,
         }
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_flatten_op.py b/python/paddle/fluid/tests/unittests/test_flatten_op.py
index 17b01e03124e8007c51107b414c628d4bfc49c79..effa2a148eef8b0047b12c676803abb2871e8118 100644
--- a/python/paddle/fluid/tests/unittests/test_flatten_op.py
+++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py
@@ -22,14 +22,17 @@ from op_test import OpTest
 
 class TestFlattenOp(OpTest):
     def setUp(self):
-        self.op_type = "flatten"
+        self.op_type = "flatten2"
         self.init_test_case()
         self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
         self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.in_shape).astype("float32")
+        }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=["XShape"])
 
     def test_check_grad(self):
         self.check_grad(["X"], "Out")
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
index 764f83b534c8a183dbf21511f0b05741c13c9528..36ebc8fb6ea9efdcd1807f5c8917ab1428b3381e 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
@@ -37,7 +37,7 @@ def fusion_gru(
                h0,
                wh,
                np.zeros(
-                   (1, wh.shape[1]), dtype='float64'),
+                   (1, wh.shape[1]), dtype='float32'),
                is_reverse,
                act_state,
                act_gate)
@@ -62,15 +62,15 @@ class TestFusionGRUOp(OpTest):
         T = sum(self.lod[0])
         N = len(self.lod[0])
 
-        x = np.random.rand(T, self.M).astype('float64')
-        wx = np.random.rand(self.M, 3 * self.D).astype('float64')
-        wh = np.random.rand(self.D, 3 * self.D).astype('float64')
+        x = np.random.rand(T, self.M).astype('float32')
+        wx = np.random.rand(self.M, 3 * self.D).astype('float32')
+        wh = np.random.rand(self.D, 3 * self.D).astype('float32')
         bias = np.random.rand(
-            1, 3 * self.D).astype('float64') if self.with_bias else np.zeros(
-                (1, 3 * self.D), dtype='float64')
+            1, 3 * self.D).astype('float32') if self.with_bias else np.zeros(
+                (1, 3 * self.D), dtype='float32')
         h0 = np.random.rand(
-            N, self.D).astype('float64') if self.with_h0 else np.zeros(
-                (N, self.D), dtype='float64')
+            N, self.D).astype('float32') if self.with_h0 else np.zeros(
+                (N, self.D), dtype='float32')
 
         _, _, _, hidden = fusion_gru(
             x, self.lod, h0, wx, wh, bias, self.is_reverse,
@@ -93,7 +93,9 @@ class TestFusionGRUOp(OpTest):
         }
 
     def test_check_output(self):
-        self.check_output(atol=1e-8)
+        for use_seq in {True, False}:
+            self.attrs['use_seq'] = use_seq
+            self.check_output()
 
 
 class TestFusionGRUOpNoInitial(TestFusionGRUOp):
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
index 5805bdf461998e90611dec05b079cd55feda520d..de0c86f96db958eebd7e74346bec244f0c804ed9 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
@@ -53,11 +53,11 @@ class TestFusionLSTMOp(OpTest):
         self.M = 8
         self.D = 16
         self.has_initial_state = False
+        self.use_peepholes = False
         self.is_reverse = False
         self.act_gate = 'sigmoid'
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
-        self.use_peepholes = False
         self.set_conf()
 
         T = sum(self.lod[0])
@@ -114,7 +114,9 @@ class TestFusionLSTMOp(OpTest):
         }
 
     def test_check_output(self):
-        self.check_output()
+        for use_seq in {True, False}:
+            self.attrs['use_seq'] = use_seq
+            self.check_output()
 
 
 class TestFusionLSTMOpInit(TestFusionLSTMOp):
@@ -157,5 +159,36 @@ class TestFusionLSTMOpBS1(TestFusionLSTMOp):
         self.D = 16
 
 
+class TestFusionLSTMOpPeepholes(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_peepholes = True
+
+
+class TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_peepholes = True
+        self.has_initial_state = True
+
+
+class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_peepholes = True
+        self.is_reverse = True
+
+
+class TestFusionLSTMOpPeepholesInitReverse(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_peepholes = True
+        self.has_initial_state = True
+        self.is_reverse = True
+
+
+class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_peepholes = True
+        self.lod = [[2]]
+        self.D = 8
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
index ce766fffbce98a6a2cee4c508d6db85ee0163401..6dc101b6dad8813893c6a891da0e16f952bb4c2d 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
@@ -177,8 +177,8 @@ def _box_to_delta(ex_boxes, gt_boxes, weights):
 
     dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]
     dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]
-    dw = (np.log(gt_w / ex_w)) / ex_w / weights[2]
-    dh = (np.log(gt_h / ex_h)) / ex_h / weights[3]
+    dw = (np.log(gt_w / ex_w)) / weights[2]
+    dh = (np.log(gt_h / ex_h)) / weights[3]
 
     targets = np.vstack([dx, dy, dw, dh]).transpose()
     return targets
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index f2fccd5d76777b183823c684d5e5ace317925bb0..bc4d364c74c6cb6b8f0df59e7ede77e6271f4b96 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -521,6 +521,20 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_pad2d(self):
+        program = Program()
+        with program_guard(program):
+            input = layers.data(
+                name="input", shape=[3, 100, 100], dtype="float32")
+            out = layers.pad2d(
+                input,
+                paddings=[1, 2, 3, 4],
+                mode='reflect',
+                data_format='NCHW',
+                name="shape")
+            self.assertIsNotNone(out)
+        print(str(program))
+
     def test_prelu(self):
         program = Program()
         with program_guard(program):
@@ -535,6 +549,13 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_sequence_enumerate(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="input", shape=[1], dtype='int32', lod_level=1)
+            out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pad2d_op.py b/python/paddle/fluid/tests/unittests/test_pad2d_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..728b8c181a4410d7df7f304bcc8d2816e91ea6d8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pad2d_op.py
@@ -0,0 +1,102 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestPad2dOp(OpTest):
+    def setUp(self):
+        self.pad_value = 0.0
+        self.initTestCase()
+        self.op_type = "pad2d"
+        self.inputs = {'X': np.random.random(self.shape).astype("float32"), }
+        self.attrs = {}
+        self.attrs['paddings'] = np.array(self.paddings).flatten()
+        self.attrs['pad_value'] = self.pad_value
+        self.attrs['mode'] = self.mode
+        self.attrs['data_format'] = self.data_format
+        if self.data_format == "NCHW":
+            paddings = [(0, 0), (0, 0), (self.paddings[0], self.paddings[1]),
+                        (self.paddings[2], self.paddings[3])]
+        else:
+            paddings = [(0, 0), (self.paddings[0], self.paddings[1]),
+                        (self.paddings[2], self.paddings[3]), (0, 0)]
+        if self.mode == "constant":
+            out = np.pad(self.inputs['X'],
+                         paddings,
+                         mode=self.mode,
+                         constant_values=self.pad_value)
+        else:
+            out = np.pad(self.inputs['X'], paddings, mode=self.mode)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.006)
+
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 4)
+        self.paddings = [0, 1, 2, 3]
+        self.mode = "constant"
+        self.data_format = "NCHW"
+        self.pad_value = 0.0
+
+
+class TestCase1(TestPad2dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 4)
+        self.paddings = [0, 1, 2, 3]
+        self.mode = "reflect"
+        self.data_format = "NCHW"
+
+
+class TestCase2(TestPad2dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 4)
+        self.paddings = [0, 1, 2, 3]
+        self.mode = "edge"
+        self.data_format = "NCHW"
+
+
+class TestCase3(TestPad2dOp):
+    def initTestCase(self):
+        self.shape = (2, 4, 4, 2)
+        self.paddings = [0, 1, 2, 3]
+        self.mode = "reflect"
+        self.data_format = "NHWC"
+
+
+class TestCase4(TestPad2dOp):
+    def initTestCase(self):
+        self.shape = (2, 4, 4, 2)
+        self.paddings = [0, 1, 2, 3]
+        self.mode = "edge"
+        self.data_format = "NHWC"
+
+
+class TestCase5(TestPad2dOp):
+    def initTestCase(self):
+        self.shape = (2, 4, 4, 2)
+        self.paddings = [0, 1, 2, 3]
+        self.mode = "constant"
+        self.pad_value = 1.2
+        self.data_format = "NHWC"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
index 372ef748b2e704fd3858c382e048e51448ed3bd5..a49c5d9b43ae1bffa7cb57764db497f68030b151 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -85,6 +85,7 @@ class TestFetchOp(unittest.TestCase):
                     assert not math.isnan(np.sum(ret[i])) and \
                            not math.isinf(np.sum(ret[i]))
 
+    @unittest.skip(reason="CI timeout")
     def test_fetch_op(self):
         tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
         tst_reader_iter = tst_reader()
@@ -139,6 +140,7 @@ class TestFeedParallel(unittest.TestCase):
             if batch_id == 2:
                 break
 
+    @unittest.skip(reason="CI timeout")
     def test_feed_op(self):
         os.environ['CPU_NUM'] = str(4)
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py
index 1e3e40d54a78045c8d8fdd9a3a3715107d1e7a80..48a6b0577b6787d2e1231fdcbe6d2c1bb46414ed 100644
--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import six
 from op_test import OpTest
 
 
@@ -62,17 +63,20 @@ class PReluTest(OpTest):
 
 
 # TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues
-#  class TestCase1(PReluTest):
-#  def initTestCase(self):
-#  self.attrs = {'mode': "all"}
+if six.PY2:
 
-#  class TestCase2(PReluTest):
-#  def initTestCase(self):
-#  self.attrs = {'mode': "channel"}
+    class TestCase1(PReluTest):
+        def initTestCase(self):
+            self.attrs = {'mode': "all"}
+
+    class TestCase2(PReluTest):
+        def initTestCase(self):
+            self.attrs = {'mode': "channel"}
+
+    class TestCase3(PReluTest):
+        def initTestCase(self):
+            self.attrs = {'mode': "element"}
 
-#  class TestCase3(PReluTest):
-#  def initTestCase(self):
-#  self.attrs = {'mode': "element"}
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index 1de35dc35b0176b77eb2d9b25cd6ee4e645e56c3..0557593657e2e480a509902a07f25723b2c710b0 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -22,106 +22,39 @@ from op_test import OpTest
 
 class TestReshapeOp(OpTest):
     def setUp(self):
-        ori_shape = (2, 25)
-        new_shape = (5, 10)
-
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestReshapeOpDimInfer1(OpTest):
-    def setUp(self):
-        ori_shape = (5, 10)
-        new_shape = (5, -1, 5)
-
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestReshapeOpDimInfer2(OpTest):
-    def setUp(self):
-        ori_shape = (2, 2, 6)
-        new_shape = (2, 0, 3, -1)
-        infered_shape = (2, 2, 3, -1)
-
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestReshapeOpInplace(OpTest):
-    def setUp(self):
-        ori_shape = (2, 25)
-        new_shape = (5, 10)
-
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestReshapeOpDimInferInplace1(OpTest):
-    def setUp(self):
-        ori_shape = (5, 10)
-        new_shape = (5, -1, 5)
+        self.init_data()
+        self.op_type = "reshape2"
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
+        self.attrs = {"shape": self.new_shape}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.infered_shape),
+            'XShape': np.random.random(self.ori_shape).astype("float32")
+        }
 
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+    def init_data(self):
+        self.ori_shape = (2, 25)
+        self.new_shape = (5, 10)
+        self.infered_shape = (5, 10)
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=['XShape'])
 
     def test_check_grad(self):
         self.check_grad(["X"], "Out")
 
 
-class TestReshapeOpDimInferInplace2(OpTest):
-    def setUp(self):
-        ori_shape = (2, 2, 6)
-        new_shape = (2, 0, 3, -1)
-        infered_shape = (2, 2, 3, -1)
-
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
+class TestReshapeOpDimInfer1(TestReshapeOp):
+    def init_data(self):
+        self.ori_shape = (5, 10)
+        self.new_shape = (5, -1, 5)
+        self.infered_shape = (5, -1, 5)
 
-    def test_check_output(self):
-        self.check_output()
 
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+class TestReshapeOpDimInfer2(TestReshapeOp):
+    def init_data(self):
+        self.ori_shape = (2, 2, 6)
+        self.new_shape = (2, 0, 3, -1)
+        self.infered_shape = (2, 2, 3, -1)
 
 
 class TestReshapeOpWithInputShape(OpTest):
@@ -130,20 +63,23 @@ class TestReshapeOpWithInputShape(OpTest):
         new_shape = (0, -1, 5)
         actual_shape = (2, 3, 5)
 
-        self.op_type = "reshape"
+        self.op_type = "reshape2"
         self.inputs = {
             "X": np.random.random(ori_shape).astype("float32"),
             "Shape": np.array(
                 actual_shape, dtype="int32")
         }
         self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(actual_shape),
+            'XShape': np.random.random(ori_shape).astype("float32")
+        }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=['XShape'])
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+        self.check_grad(["X"], "Out", sum_outputs=["Out"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
index 3d4623c74d9a307b12ab6d72ad0b4d2dae938720..70848e4e2239e2be160bb0c1a28a5aecd01a87dc 100644
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
@@ -15,90 +15,164 @@
 from __future__ import print_function
 
 import unittest
+
 import numpy as np
-from op_test import OpTest
-
-
-class TestRmspropOp1(OpTest):
-    ''' Test RMSProp with explicit inputs
-    '''
-
-    def setUp(self):
-        self.op_type = "rmsprop"
-
-        param = np.random.random((123, 321)).astype("float32")
-        mean_square = np.random.random((123, 321)).astype("float32")
-        learning_rate = np.array([0.01]).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        moment = np.zeros((123, 321)).astype("float32")
-
-        epsilon = 1e-6
-        decay = 0.9
-        momentum = 0.0
-
-        self.inputs = {
-            'Param': param,
-            'MeanSquare': mean_square,
-            'LearningRate': learning_rate,
-            'Grad': grad,
-            'Moment': moment,
-        }
-
-        self.attrs = {'epsilon': epsilon, 'decay': decay, 'momentum': momentum}
-
-        ms_out = decay * mean_square + (1 - decay) * grad * grad
-        moment_out = momentum * moment + \
-            learning_rate * grad / np.sqrt(ms_out + epsilon)
-        param_out = param - moment_out
-
-        self.outputs = {
-            'ParamOut': param_out,
-            'MomentOut': moment_out,
-            'MeanSquareOut': ms_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestRmspropOp2(OpTest):
-    '''Test RMSProp with default values for attributes
-    '''
-
-    def setUp(self):
-        self.op_type = "rmsprop"
-
-        param = np.random.random((123, 321)).astype("float32")
-        mean_square = np.random.random((123, 321)).astype("float32")
-        learning_rate = np.array([0.01]).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        moment = np.zeros((123, 321)).astype("float32")
-
-        epsilon = 1.0e-10
-        decay = 0.9
-        momentum = 0.0
-
-        self.inputs = {
-            'Param': param,
-            'MeanSquare': mean_square,
-            'LearningRate': learning_rate,
-            'Grad': grad,
-            'Moment': moment,
-        }
-
-        ms_out = decay * mean_square + (1 - decay) * grad * grad
-        moment_out = momentum * moment + \
-            learning_rate * grad / np.sqrt(ms_out + epsilon)
-        param_out = param - moment_out
-
-        self.outputs = {
-            'ParamOut': param_out,
-            'MomentOut': moment_out,
-            'MeanSquareOut': ms_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+class TestBase(unittest.TestCase):
+    def setup(self, centered, epsilon=1e-6):
+        np.random.seed(5)  # fix seed
+
+        self.param_name = "param"
+        self.param = np.random.random((123, 321)).astype("float32")
+
+        self.mean_square_name = "mean_square"
+        self.mean_square = np.random.random((123, 321)).astype("float32")
+
+        self.mean_grad_name = "mean_grad"
+        self.mean_grad = np.random.random((123, 321)).astype("float32")
+
+        self.lr_name = "lr"
+        self.learning_rate = np.array([0.01]).astype("float32")
+
+        self.grad_name = "grad"
+        self.grad = np.random.random((123, 321)).astype("float32")
+
+        self.moment_name = "moment"
+        self.moment = np.zeros((123, 321)).astype("float32")
+
+        self.epsilon = epsilon
+        self.decay = 0.9
+        self.momentum = 0.0
+        self.centered = centered
+
+        self.ms_out = self.decay * self.mean_square + (1 - self.decay
+                                                       ) * self.grad * self.grad
+        if centered:
+            self.mg_out = self.decay * self.mean_grad + (1 - self.decay
+                                                         ) * self.grad
+            self.moment_out = self.momentum * self.moment + \
+                              self.learning_rate * self.grad / np.sqrt(self.ms_out - np.square(self.mg_out) + self.epsilon)
+        else:
+            self.moment_out = self.momentum * self.moment + \
+                              self.learning_rate * self.grad / np.sqrt(self.ms_out + self.epsilon)
+
+        self.param_out = self.param - self.moment_out
+
+    def check(self,
+              actual_t,
+              expect_t,
+              place,
+              out_name,
+              atol=1e-5,
+              equal_nan=False):
+        self.assertTrue(
+            np.allclose(
+                actual_t, expect_t, atol=atol, equal_nan=equal_nan),
+            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
+            + str(expect_t) + "\n" + "But Got" + str(actual_t))
+
+
+class TestRmspropOp(TestBase):
+    def check_with_place(self, place, centered, epsilon):
+        self.setup(centered, epsilon)
+        scope = core.Scope()
+
+        # create and initialize Param Variable
+        param = scope.var(self.param_name).get_tensor()
+        param.set(self.param, place)
+
+        mean_square = scope.var(self.mean_square_name).get_tensor()
+        mean_square.set(self.mean_square, place)
+
+        lr = scope.var(self.lr_name).get_tensor()
+        lr.set(self.learning_rate, place)
+
+        grad = scope.var(self.grad_name).get_tensor()
+        grad.set(self.grad, place)
+
+        moment = scope.var(self.moment_name).get_tensor()
+        moment.set(self.moment, place)
+
+        # create and run sgd operator
+
+        if self.centered:
+            mean_grad = scope.var(self.mean_grad_name).get_tensor()
+            mean_grad.set(self.mean_grad, place)
+
+            rmsprop_op = Operator(
+                "rmsprop",
+                Param=self.param_name,
+                Grad=self.grad_name,
+                MeanSquare=self.mean_square_name,
+                MeanGrad=self.mean_grad_name,
+                Moment=self.moment_name,
+                LearningRate=self.lr_name,
+                ParamOut=self.param_name,
+                MeanSquareOut=self.mean_square_name,
+                MomentOut=self.moment_name,
+                MeanGradOut=self.mean_grad_name,
+                epsilon=self.epsilon,
+                decay=self.decay,
+                momentum=self.momentum,
+                centered=True)
+        else:
+            rmsprop_op = Operator(
+                "rmsprop",
+                Param=self.param_name,
+                Grad=self.grad_name,
+                MeanSquare=self.mean_square_name,
+                Moment=self.moment_name,
+                LearningRate=self.lr_name,
+                ParamOut=self.param_name,
+                MeanSquareOut=self.mean_square_name,
+                MomentOut=self.moment_name,
+                epsilon=self.epsilon,
+                decay=self.decay,
+                momentum=self.momentum,
+                centered=False)
+
+        rmsprop_op.run(scope, place)
+
+        atol = 1e-5
+        equal_nan = False
+
+        if self.centered:
+            atol = 1e-3
+            equal_nan = True
+
+        self.check(
+            np.array(mean_square), self.ms_out, place, self.mean_square_name)
+        self.check(
+            np.array(moment),
+            self.moment_out,
+            place,
+            self.moment_name,
+            atol=atol,
+            equal_nan=equal_nan)
+        self.check(
+            np.array(param),
+            self.param_out,
+            place,
+            self.param_name,
+            atol=atol,
+            equal_nan=equal_nan)
+
+        if self.centered:
+            self.check(
+                np.array(mean_grad), self.mg_out, place, self.mean_grad_name)
+
+    def test_rmsprop(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place, False, 1e-6)
+            self.check_with_place(place, False, 1e-10)
+            self.check_with_place(place, True, 1e-6)
+            self.check_with_place(place, True, 1e-10)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
index ed7f467835f32242a9650f226b4a5ad9d6d87af4..ad4cd2e803bfae4c3fbc04503331b9a786b25d17 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
@@ -61,7 +61,7 @@ class TestROIPoolOp(OpTest):
 
         for i in range(self.rois_num):
             roi = self.rois[i]
-            roi_batch_id = roi[0]
+            roi_batch_id = int(roi[0])
             roi_start_w = int(cpt.round(roi[1] * self.spatial_scale))
             roi_start_h = int(cpt.round(roi[2] * self.spatial_scale))
             roi_end_w = int(cpt.round(roi[3] * self.spatial_scale))
@@ -125,7 +125,7 @@ class TestROIPoolOp(OpTest):
                 roi = [bno, x1, y1, x2, y2]
                 rois.append(roi)
         self.rois_num = len(rois)
-        self.rois = np.array(rois).astype("int64")
+        self.rois = np.array(rois).astype("float32")
 
     def setUp(self):
         self.op_type = "roi_pool"
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
index 08c462d9036cacab81dab7c9ea16664c9159479f..bd548009b3ada9512e4b5f7d7b61b67b0717a39b 100644
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -18,12 +18,17 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
+from test_anchor_generator_op import anchor_generator_in_python
+from test_generate_proposal_labels import _generate_groundtruth
+from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta
 
 
-def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
-                      rpn_negative_overlap, fg_fraction):
-    iou = np.transpose(iou)
+def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im,
+                      rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
+    iou = np.transpose(gt_anchor_iou)
     anchor_to_gt_max = iou.max(axis=1)
+    anchor_to_gt_argmax = iou.argmax(axis=1)
+
     gt_to_anchor_argmax = iou.argmax(axis=0)
     gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
     anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
@@ -42,59 +47,113 @@ def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
 
     num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
     bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
+    tgt_lbl[bg_inds] = 0
     if len(bg_inds) > num_bg:
         enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
         tgt_lbl[enable_inds] = 0
     bg_inds = np.where(tgt_lbl == 0)[0]
+    tgt_lbl[bg_inds] = 0
 
     loc_index = fg_inds
     score_index = np.hstack((fg_inds, bg_inds))
     tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
-    return loc_index, score_index, tgt_lbl
+
+    gt_inds = anchor_to_gt_argmax[fg_inds]
+
+    return loc_index, score_index, tgt_lbl, gt_inds
+
+
+def get_anchor(n, c, h, w):
+    input_feat = np.random.random((n, c, h, w)).astype('float32')
+    anchors, _ = anchor_generator_in_python(
+        input_feat=input_feat,
+        anchor_sizes=[32., 64.],
+        aspect_ratios=[0.5, 1.0],
+        variances=[1.0, 1.0, 1.0, 1.0],
+        stride=[16.0, 16.0],
+        offset=0.5)
+    return anchors
+
+
+def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im,
+             rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
+
+    loc_indexes = []
+    score_indexes = []
+    tmp_tgt_labels = []
+    tgt_bboxes = []
+    anchor_num = anchor.shape[0]
+
+    batch_size = len(lod) - 1
+    for i in range(batch_size):
+        b, e = lod[i], lod[i + 1]
+        iou_slice = iou[b:e, :]
+        bboxes_slice = gt_boxes[b:e, :]
+
+        loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign(
+            iou_slice, rpn_batch_size_per_im, rpn_positive_overlap,
+            rpn_negative_overlap, fg_fraction)
+
+        fg_bboxes = bboxes_slice[gt_inds]
+        fg_anchors = anchor[loc_idx]
+        box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.])
+
+        if i == 0:
+            loc_indexes = loc_idx
+            score_indexes = score_idx
+            tmp_tgt_labels = tgt_lbl
+            tgt_bboxes = box_deltas
+        else:
+            loc_indexes = np.concatenate(
+                [loc_indexes, loc_idx + i * anchor_num])
+            score_indexes = np.concatenate(
+                [score_indexes, score_idx + i * anchor_num])
+            tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl])
+            tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
+
+    tgt_labels = tmp_tgt_labels[score_indexes]
+    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels
 
 
 class TestRpnTargetAssignOp(OpTest):
     def setUp(self):
-        iou = np.random.random((10, 8)).astype("float32")
-        self.op_type = "rpn_target_assign"
-        self.inputs = {'DistMat': iou}
-        self.attrs = {
-            'rpn_batch_size_per_im': 256,
-            'rpn_positive_overlap': 0.95,
-            'rpn_negative_overlap': 0.3,
-            'fg_fraction': 0.25,
-            'fix_seed': True
-        }
-        loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 256, 0.95, 0.3,
-                                                            0.25)
-        self.outputs = {
-            'LocationIndex': loc_index,
-            'ScoreIndex': score_index,
-            'TargetLabel': tgt_lbl,
-        }
+        n, c, h, w = 2, 4, 14, 14
+        anchor = get_anchor(n, c, h, w)
+        gt_num = 10
+        anchor = anchor.reshape(-1, 4)
+        anchor_num = anchor.shape[0]
 
-    def test_check_output(self):
-        self.check_output()
+        im_shapes = [[64, 64], [64, 64]]
+        gt_box, lod = _generate_groundtruth(im_shapes, 3, 4)
+        bbox = np.vstack([v['boxes'] for v in gt_box])
 
+        iou = _bbox_overlaps(bbox, anchor)
+
+        anchor = anchor.astype('float32')
+        bbox = bbox.astype('float32')
+        iou = iou.astype('float32')
+
+        loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob(
+            anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25)
 
-class TestRpnTargetAssignOp2(OpTest):
-    def setUp(self):
-        iou = np.random.random((10, 20)).astype("float32")
         self.op_type = "rpn_target_assign"
-        self.inputs = {'DistMat': iou}
+        self.inputs = {
+            'Anchor': anchor,
+            'GtBox': (bbox, [[4, 4]]),
+            'DistMat': (iou, [[4, 4]]),
+        }
         self.attrs = {
-            'rpn_batch_size_per_im': 128,
-            'rpn_positive_overlap': 0.5,
-            'rpn_negative_overlap': 0.5,
-            'fg_fraction': 0.5,
+            'rpn_batch_size_per_im': 25600,
+            'rpn_positive_overlap': 0.95,
+            'rpn_negative_overlap': 0.03,
+            'fg_fraction': 0.25,
             'fix_seed': True
         }
-        loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 128, 0.5, 0.5,
-                                                            0.5)
         self.outputs = {
-            'LocationIndex': loc_index,
-            'ScoreIndex': score_index,
-            'TargetLabel': tgt_lbl,
+            'LocationIndex': loc_index.astype('int32'),
+            'ScoreIndex': score_index.astype('int32'),
+            'TargetBBox': tgt_bbox.astype('float32'),
+            'TargetLabel': tgt_lbl.astype('int64'),
         }
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
index 708265b4576809b1f4157d54989c6138c6e5a2b0..674ef2ddf44edb4246c9d952cb75b36fe3d6ddc8 100644
--- a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
@@ -25,9 +25,9 @@ class TestSamplingIdOp(OpTest):
         self.op_type = "sampling_id"
         self.use_mkldnn = False
         self.init_kernel_type()
-        self.X = np.random.random((8, 4)).astype('float32')
+        self.X = np.random.random((100, 10)).astype('float32')
         self.inputs = {"X": self.X}
-        self.Y = np.random.random(8).astype('float32')
+        self.Y = np.random.random(100).astype('int64')
         self.outputs = {'Out': self.Y}
         self.attrs = {'max': 1.0, 'min': 0.0, 'seed': 1}
 
@@ -36,6 +36,16 @@ class TestSamplingIdOp(OpTest):
         y1 = self.out
         self.check_output_customized(self.verify_output)
         y2 = self.out
+
+        # check dtype
+        assert y1.dtype == np.int64
+        assert y2.dtype == np.int64
+
+        # check output is index ids of inputs
+        inputs_ids = np.arange(self.X.shape[1])
+        assert np.isin(y1, inputs_ids).all()
+        assert np.isin(y2, inputs_ids).all()
+
         self.assertTrue(np.array_equal(y1, y2))
         self.assertEqual(len(y1), len(self.Y))
 
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py b/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..9814ec0a15e1803b356f300d378c31e57ba36c09
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py
@@ -0,0 +1,105 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def sequence_enumerate(input_seq, in_lod, win_size, pad_value):
+    lod0 = [0]
+    for i in range(0, len(in_lod[0])):
+        lod0.append(lod0[i] + in_lod[0][i])
+    out_seq = []
+    for i in range(0, len(lod0) - 1):
+        for idx in range(lod0[i], lod0[i + 1]):
+            single_seq = []
+            for word_idx in range(win_size):
+                word_pos = idx + word_idx
+                dat = input_seq[word_pos] if word_pos < lod0[i+1] \
+                    else pad_value
+                single_seq.append(dat)
+            out_seq.append(single_seq)
+    return out_seq
+
+
+class TestSequenceEnumerateOp(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_enumerate"
+        self.init_test_case()
+        self.inputs = {'X': (self.in_seq, self.lod)}
+        self.attrs = {'win_size': self.win_size, 'pad_value': self.pad_value}
+        self.outputs = {'Out': (self.out_seq, self.lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init_test_case(self):
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        self.lod = [[9, 4, 11, 6]]
+        self.win_size = 2
+        self.pad_value = 0
+        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
+                                     self.pad_value)
+        self.out_seq = np.array(out_seq).astype("int32")
+
+
+class TesSequenceEnumerateOpInt64(TestSequenceEnumerateOp):
+    def init_test_case(self):
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
+        self.lod = [[9, 4, 11, 6]]
+        self.win_size = 2
+        self.pad_value = 0
+        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
+                                     self.pad_value)
+        self.out_seq = np.array(out_seq).astype("int64")
+
+
+class TestSequenceEnumerateOpLargeWinSize(TestSequenceEnumerateOp):
+    def init_test_case(self):
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        self.lod = [[9, 4, 11, 6]]
+        self.win_size = 5
+        self.pad_value = 0
+        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
+                                     self.pad_value)
+        self.out_seq = np.array(out_seq).astype("int32")
+
+
+class TestSequenceEnumerateOpMaxWinSize(TestSequenceEnumerateOp):
+    def init_test_case(self):
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        self.lod = [[9, 4, 11, 6]]
+        self.win_size = 30
+        self.pad_value = 0
+        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
+                                     self.pad_value)
+        self.out_seq = np.array(out_seq).astype("int32")
+
+
+class TestSequenceEnumerateOpLargePadValue(TestSequenceEnumerateOp):
+    def init_test_case(self):
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        self.lod = [[9, 4, 11, 6]]
+        self.win_size = 5
+        self.pad_value = 5
+        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
+                                     self.pad_value)
+        self.out_seq = np.array(out_seq).astype("int32")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
index 2be8e24a0fae6945351eb767ac924d7ca70848ab..204a4bb40196bd1fc2f5861aa31cf9560ea4d349 100644
--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
@@ -23,14 +23,17 @@ from op_test import OpTest
 # Correct: General.
 class TestSqueezeOp(OpTest):
     def setUp(self):
-        self.op_type = "squeeze"
+        self.op_type = "squeeze2"
         self.init_test_case()
         self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
         self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.ori_shape).astype("float32")
+        }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=['XShape'])
 
     def test_check_grad(self):
         self.check_grad(["X"], "Out")
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py
index 0853f80b82030679d140f7fabdd42557c2374599..c30da2389d50d3b6bdf1f911aaed6ed71f274153 100644
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -22,16 +22,19 @@ from op_test import OpTest
 class TestTransposeOp(OpTest):
     def setUp(self):
         self.initTestCase()
-        self.op_type = "transpose"
+        self.op_type = "transpose2"
         self.inputs = {'X': np.random.random(self.shape).astype("float32")}
         self.attrs = {'axis': list(self.axis)}
-        self.outputs = {'Out': self.inputs['X'].transpose(self.axis)}
+        self.outputs = {
+            'XShape': np.random.random(self.shape).astype("float32"),
+            'Out': self.inputs['X'].transpose(self.axis)
+        }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=['XShape'])
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', sum_outputs=['Out'])
 
     def initTestCase(self):
         self.shape = (3, 4)
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index a324438ba5a3c3b57fd956bd11189ef7d50267e2..14dd2bb06f9a18d0b15a4aee4e9e6bfdf8c41206 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -24,13 +24,16 @@ from op_test import OpTest
 class TestUnsqueezeOp(OpTest):
     def setUp(self):
         self.init_test_case()
-        self.op_type = "unsqueeze"
+        self.op_type = "unsqueeze2"
         self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
         self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.ori_shape).astype("float32")
+        }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=["XShape"])
 
     def test_check_grad(self):
         self.check_grad(["X"], "Out")
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index d094647afe1900809fc32cae93f777765f72c675..30cdfe4ad2c9892184862b70ff49417ce5a08516 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -431,6 +431,28 @@ class Trainer(object):
             exe = executor.Executor(self.place)
             io.save_persistables(exe, dirname=param_path)
 
+    def save_inference_model(self, param_path, feeded_var_names,
+                             target_var_indexes):
+        """
+        Save model for cpp inference into :code:`param_path`.
+
+        Args:
+            param_path(str): The path to save parameters.
+            feeded_var_names(list(str)): The name of the vars that you
+                need to feed in before run program.
+            target_var_indexes(list(int)): the index of target var that
+                you need to return in trainer.train_func.
+        Returns:
+            None
+        """
+        with self._prog_and_scope_guard():
+            exe = executor.Executor(self.place)
+            target_vars = [
+                self.train_func_outputs[index] for index in target_var_indexes
+            ]
+            io.save_inference_model(param_path, feeded_var_names, target_vars,
+                                    exe)
+
     @contextlib.contextmanager
     def _prog_and_scope_guard(self):
         with framework.program_guard(
diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py
index f0fafaa84a73d641ff6ceb74def6addaea759516..a83aa0f11eed9bfc1674d8d75dcfacc297f056b0 100644
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -153,7 +153,7 @@ def block_to_code(block, block_idx):
 
     indent += 1
     # sort all vars
-    all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0])
+    all_vars = sorted(six.iteritems(block.vars), key=lambda x: x[0])
     for var in all_vars:
         print("{}{}".format(get_indent_space(indent), variable_to_code(var[1])))
 
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index a6266a7b0c9ac40eac7b2823fc7ddf38f55357a9..d4d218d547a394a56c040ade2a9ba703b691b86b 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -300,7 +300,7 @@ class DistributeTranspiler(object):
             input_deps = grad_name_to_send_dummy_out.values()
             program.global_block().append_op(
                 type="send_barrier",
-                inputs={"X": input_deps},
+                inputs={"X": list(input_deps)},
                 outputs={"Out": send_barrier_out},
                 attrs={
                     "endpoints": pserver_endpoints,
@@ -401,7 +401,7 @@ class DistributeTranspiler(object):
 
         Args:
             recv_vars (list): Variable list to recv for current trainer_id
-            eplist (list): A list of strings indicating 
+            eplist (list): A list of strings indicating
 
         Returns:
             Program: trainer side startup program.
@@ -455,7 +455,7 @@ class DistributeTranspiler(object):
             if len(splited_var) <= 1:
                 continue
             # NOTE: if enable memory optimization, origin vars maybe removed.
-            if startup_program.global_block().vars.has_key(varname):
+            if varname in startup_program.global_block().vars:
                 orig_param = startup_program.global_block().vars[varname]
             else:
                 origin_param_var = self.origin_program.global_block().vars[
@@ -690,7 +690,7 @@ class DistributeTranspiler(object):
 
         Args:
             endpoint (str): current pserver endpoint.
-        
+
         Returns:
             tuple: (main_program, startup_program), of type "Program"
         """
@@ -713,7 +713,7 @@ class DistributeTranspiler(object):
             endpoint (str): current pserver endpoint.
             pserver_program (Program): deprecated, call get_pserver_program first.
             startup_program (Program): deprecated, should pass startup_program
-                when initalizing 
+                when initalizing
 
         Returns:
             Program: parameter server side startup program.
@@ -1096,7 +1096,8 @@ class DistributeTranspiler(object):
             self.table_name]
 
         zero_dim = int(
-            math.ceil(origin_param_var.shape[0] / len(self.pserver_endpoints)))
+            math.ceil(origin_param_var.shape[0] / float(
+                len(self.pserver_endpoints))))
         table_shape = list(origin_param_var.shape)
         table_shape[0] = zero_dim
 
@@ -1390,13 +1391,11 @@ class DistributeTranspiler(object):
                 inputs={"X": vars2merge},
                 outputs={"Out": merged_var},
                 attrs={"use_mkldnn": False})
-            # TODO(panyx0718): What if it's SELECTED_ROWS.
-            if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
-                optimize_block.append_op(
-                    type="scale",
-                    inputs={"X": merged_var},
-                    outputs={"Out": merged_var},
-                    attrs={"scale": 1.0 / float(self.trainer_num)})
+            optimize_block.append_op(
+                type="scale",
+                inputs={"X": merged_var},
+                outputs={"Out": merged_var},
+                attrs={"scale": 1.0 / float(self.trainer_num)})
         return merged_var
 
     def _append_pserver_ops(self, optimize_block, opt_op, endpoint,