Commit 8331e835 authored by Yang Yu

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_CudnnHolder_bug

@@ -213,9 +213,11 @@ include(configure)  # add paddle env configuration
 if(WITH_GPU)
     include(cuda)
     include(tensorrt)
+endif()
+if(WITH_MKL OR WITH_MKLML)
     include(external/anakin)
 elseif()
-    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
+    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()
 include(generic)  # simplify cmake module
...
@@ -11,6 +11,7 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
 # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
 # example: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
 RUN pip install -U pip
 RUN pip install -U kubernetes paddlepaddle
@@ -27,5 +28,6 @@ ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
+ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
 ADD models/ /workspace/models/
@@ -17,7 +17,8 @@ import argparse
 __all__ = ['parse_args', ]

 BENCHMARK_MODELS = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "se_resnext", "vgg", "mnist",
+    "stacked_dynamic_lstm", "resnet_with_preprocess"
 ]
@@ -67,12 +68,12 @@ def parse_args():
         '--cpus',
         type=int,
         default=1,
-        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+        help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
     parser.add_argument(
         '--data_set',
         type=str,
         default='flowers',
-        choices=['cifar10', 'flowers'],
+        choices=['cifar10', 'flowers', 'imagenet'],
         help='Optional dataset for benchmark.')
     parser.add_argument(
         '--infer_only', action='store_true', help='If set, run forward only.')
@@ -122,6 +123,11 @@ def parse_args():
         type=str,
         default="",
         help='Directory that contains all the training recordio files.')
+    parser.add_argument(
+        '--test_data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the test data (NOT recordio).')
     parser.add_argument(
         '--use_inference_transpiler',
         action='store_true',
@@ -130,5 +136,9 @@ def parse_args():
         '--no_random',
         action='store_true',
         help='If set, keep the random seed and do not shuffle the data.')
+    parser.add_argument(
+        '--use_lars',
+        action='store_true',
+        help='If set, use LARS for the optimizer; only supported by the resnet model.')
     args = parser.parse_args()
     return args
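Taken together, these hunks extend the benchmark CLI with a new model, a new dataset choice, and two new flags. A minimal sketch of driving the parser with them (the flag names come from the hunks above; the invocation itself is illustrative, the path is a placeholder, and it assumes the script's existing --model flag accepts the new model names):

# Illustrative only: exercises the options added in this diff.
import sys
from args import parse_args

sys.argv = [
    "fluid_benchmark.py",
    "--model", "se_resnext",             # new BENCHMARK_MODELS entry
    "--data_set", "imagenet",            # new choice
    "--test_data_path", "/path/to/val",  # new flag; placeholder path
    "--use_lars",                        # new flag
]
args = parse_args()
print(args.data_set, args.test_data_path, args.use_lars)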
@@ -16,6 +16,7 @@ import argparse
 import cProfile
 import time
 import os
+import traceback

 import numpy as np
@@ -27,7 +28,7 @@ import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
 from args import *


-def append_nccl2_prepare(trainer_id):
+def append_nccl2_prepare(trainer_id, startup_prog):
     if trainer_id >= 0:
         # append gen_nccl_id at the end of startup program
         trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
@@ -40,11 +41,11 @@ def append_nccl2_prepare(trainer_id):
         current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
         worker_endpoints.remove(current_endpoint)

-        nccl_id_var = fluid.default_startup_program().global_block().create_var(
+        nccl_id_var = startup_prog.global_block().create_var(
             name="NCCLID",
             persistable=True,
             type=fluid.core.VarDesc.VarType.RAW)
-        fluid.default_startup_program().global_block().append_op(
+        startup_prog.global_block().append_op(
             type="gen_nccl_id",
             inputs={},
             outputs={"NCCLID": nccl_id_var},
@@ -59,7 +60,7 @@ def append_nccl2_prepare(trainer_id):
             "nccl-based dist train.")


-def dist_transpile(trainer_id, args):
+def dist_transpile(trainer_id, args, train_prog, startup_prog):
     if trainer_id < 0:
         return None, None
@@ -80,133 +81,69 @@ def dist_transpile(trainer_id, args):
     # the role, should be either PSERVER or TRAINER
     training_role = os.getenv("PADDLE_TRAINING_ROLE")

-    t = distribute_transpiler.DistributeTranspiler()
+    config = distribute_transpiler.DistributeTranspilerConfig()
+    config.slice_var_up = not args.no_split_var
+    t = distribute_transpiler.DistributeTranspiler(config=config)
     t.transpile(
         trainer_id,
+        # NOTE: *MUST* use train_prog, for we are using with guard to
+        # generate different program for train and test.
+        program=train_prog,
         pservers=pserver_endpoints,
         trainers=trainers,
         sync_mode=not args.async_mode)
     if training_role == "PSERVER":
         pserver_program = t.get_pserver_program(current_endpoint)
-        pserver_startup_program = t.get_startup_program(current_endpoint,
-                                                        pserver_program)
+        pserver_startup_program = t.get_startup_program(
+            current_endpoint, pserver_program, startup_program=startup_prog)
         return pserver_program, pserver_startup_program
     elif training_role == "TRAINER":
         train_program = t.get_trainer_program()
-        return train_program, fluid.default_startup_program()
+        return train_program, startup_prog
     else:
         raise ValueError(
             'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
         )
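The transpile path above is driven entirely by environment variables. A sketch of the minimum environment a trainer appears to need (only names that occur in this diff are used; all values are placeholders):

# Sketch only: variable names are taken from the surrounding diff;
# the pserver endpoint variables are read in lines elided from this
# hunk and are not reproduced here. All values are placeholders.
import os

os.environ["PADDLE_TRAINING_ROLE"] = "TRAINER"  # or "PSERVER"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_CURRENT_IP"] = "127.0.0.1"
os.environ["PADDLE_TRAINERS"] = "1"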
-def test(exe, inference_program, test_reader, feeder, batch_acc):
-    accuracy_evaluator = fluid.metrics.Accuracy()
-    for batch_id, data in enumerate(test_reader()):
-        acc = exe.run(inference_program,
-                      feed=feeder.feed(data),
-                      fetch_list=[batch_acc])
-        accuracy_evaluator.update(value=np.array(acc), weight=len(data))
-
-    return accuracy_evaluator.eval()
-
-
-# TODO(wuyi): replace train, train_parallel, test functions with new trainer
-# API once it is ready.
-def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
-          args, train_prog, startup_prog):
-    if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
-        place = core.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-        exe.run(train_prog)
-        return
-
-    if args.use_fake_data:
-        raise Exception(
-            "fake data is not supported in single GPU test for now.")
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(startup_prog)
-
-    # Use inference_transpiler to speedup
-    if not args.use_reader_op:
-        feed_var_list = [
-            var for var in train_prog.global_block().vars.itervalues()
-            if var.is_data
-        ]
-        feeder = fluid.DataFeeder(feed_var_list, place)
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        train_losses = []
-        if not args.use_reader_op:
-            reader_generator = train_reader()
-        batch_id = 0
-        data = None
-        while True:
-            if not args.use_reader_op:
-                data = next(reader_generator, None)
-                if data == None:
-                    break
-            if iters == args.iterations:
-                reader_generator.close()
-                break
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-
-            if args.use_reader_op:
-                try:
-                    loss = exe.run(train_prog, fetch_list=[avg_loss])
-                except fluid.core.EnforceNotMet as ex:
-                    break
-            else:
-                loss = exe.run(train_prog,
-                               feed=feeder.feed(data),
-                               fetch_list=[avg_loss])
-            iters += 1
-            batch_id += 1
-            # FIXME(wuyi): For use_reader_op, if the current
-            # pass is not the last, the last batch of this pass
-            # is also equal to args.batch_size.
-            if args.use_reader_op:
-                num_samples += args.batch_size * args.gpus
-            else:
-                num_samples += len(data)
-            train_losses.append(loss)
-            print("Pass: %d, Iter: %d, Loss: %f\n" %
-                  (pass_id, iters, np.mean(train_losses)))
-        print_train_time(start_time, time.time(), num_samples)
-        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
-        # evaluation
-        if not args.no_test and batch_acc and not args.use_reader_op:
-            if args.use_inference_transpiler:
-                t = fluid.InferenceTranspiler()
-                t.transpile(infer_prog, place)
-            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
-                                 batch_acc)
-            print(", Test Accuracy: %f" % pass_test_acc)
-        print("\n")
-        # TODO(wuyi): add warmup passes to get better perf data.
-        exit(0)
+def test_parallel(exe, test_args, args, test_prog, feeder):
+    acc_evaluators = []
+    for i in xrange(len(test_args[2])):
+        acc_evaluators.append(fluid.metrics.Accuracy())
+
+    to_fetch = [v.name for v in test_args[2]]
+    if args.use_reader_op:
+        test_args[4].start()
+        while True:
+            try:
+                acc_rets = exe.run(fetch_list=to_fetch)
+                for i, e in enumerate(acc_evaluators):
+                    e.update(
+                        value=np.array(acc_rets[i]), weight=args.batch_size)
+            except fluid.core.EOFException as eof:
+                test_args[4].reset()
+                break
+    else:
+        for batch_id, data in enumerate(test_args[3]()):
+            acc_rets = exe.run(feed=feeder.feed(data), fetch_list=to_fetch)
+            for i, e in enumerate(acc_evaluators):
+                e.update(value=np.array(acc_rets[i]), weight=len(data))
+
+    return [e.eval() for e in acc_evaluators]
-# TODO(wuyi): replace train, train_parallel, test functions with new trainer
-# API once it is ready.
-def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
-                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
-                   num_trainers, trainer_id):
+# NOTE: only need to benchmark using parallelexe
+def train_parallel(train_args, test_args, args, train_prog, test_prog,
+                   startup_prog, nccl_id_var, num_trainers, trainer_id):
+    over_all_start = time.time()
     place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    feeder = None
     if not args.use_reader_op:
         feed_var_list = [
             var for var in train_prog.global_block().vars.itervalues()
             if var.is_data
         ]
         feeder = fluid.DataFeeder(feed_var_list, place)

     # generate fake:
     if args.use_fake_data:
         for var in feed_var_list:
@@ -230,63 +167,110 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
     startup_exe = fluid.Executor(place)
     startup_exe.run(startup_prog)
     strategy = fluid.ExecutionStrategy()
-    strategy.num_threads = 1
+    strategy.num_threads = args.cpus
     strategy.allow_op_delay = False
+    avg_loss = train_args[0]
+
+    if args.update_method == "pserver":
+        # parameter server mode distributed training, merge
+        # gradients on local server, do not initialize
+        # ParallelExecutor with multi server all-reduce mode.
+        num_trainers = 1
+        trainer_id = 0
+
     exe = fluid.ParallelExecutor(
         True,
         avg_loss.name,
+        main_program=train_prog,
         exec_strategy=strategy,
         num_trainers=num_trainers,
         trainer_id=trainer_id)

+    if not args.no_test:
+        if args.update_method == "pserver":
+            test_scope = None
+        else:
+            # NOTE: use an empty scope to avoid test exe using NCCLID
+            test_scope = fluid.Scope()
+        test_exe = fluid.ParallelExecutor(
+            True, main_program=test_prog, share_vars_from=exe)
+
     for pass_id in range(args.pass_num):
         num_samples = 0
         iters = 0
         start_time = time.time()
         if not args.use_reader_op:
-            reader_generator = train_reader()
+            reader_generator = train_args[3]()  # train_reader
         batch_id = 0
         data = None
+        if args.use_reader_op:
+            train_args[4].start()
         while True:
             if not args.use_reader_op:
                 data = next(reader_generator, None)
                 if data == None:
                     break
+            if args.profile and batch_id == 5:
+                profiler.start_profiler("All")
+                profiler.reset_profiler()
+            elif args.profile and batch_id == 10:
+                print("profiling total time: ", time.time() - start_time)
+                profiler.stop_profiler("total", "/tmp/profile_%d_pass%d" %
+                                       (trainer_id, pass_id))
             if iters == args.iterations:
                 reader_generator.close()
                 break
-            if args.profile and pass_id == 0 and batch_id == 5:
-                profiler.start_profiler("All")
-            elif args.profile and pass_id == 0 and batch_id == 10:
-                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
             if iters == args.skip_batch_num:
                 start_time = time.time()
                 num_samples = 0
+            fetch_list = [avg_loss.name]
+            acc_name_list = [v.name for v in train_args[2]]
+            fetch_list.extend(acc_name_list)
             if args.use_fake_data or args.use_reader_op:
                 try:
-                    loss, = exe.run([avg_loss.name])
+                    fetch_ret = exe.run(fetch_list)
+                except fluid.core.EOFException as eof:
+                    break
                 except fluid.core.EnforceNotMet as ex:
+                    traceback.print_exc()
                     break
             else:
-                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+                fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
             if args.use_reader_op:
                 num_samples += args.batch_size * args.gpus
             else:
                 num_samples += len(data)
             iters += 1
             if batch_id % 1 == 0:
-                print("Pass %d, batch %d, loss %s" %
-                      (pass_id, batch_id, np.array(loss)))
+                fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
+                print("Pass %d, batch %d, loss %s, accuracies: %s" %
+                      (pass_id, batch_id, fetched_data[0], fetched_data[1:]))
             batch_id += 1

         print_train_time(start_time, time.time(), num_samples)
-        if not args.no_test and batch_acc and not args.use_reader_op:
-            # we have not implement record io for test
-            # skip test when use args.use_reader_op
-            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
-                            batch_acc)
-            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
+        if args.use_reader_op:
+            train_args[4].reset()  # reset reader handle
+        else:
+            del reader_generator
+
+        if not args.no_test and test_args[2]:
+            test_feeder = None
+            if not args.use_reader_op:
+                test_feed_var_list = [
+                    var for var in test_prog.global_block().vars.itervalues()
+                    if var.is_data
+                ]
+                test_feeder = fluid.DataFeeder(test_feed_var_list, place)
+            test_ret = test_parallel(test_exe, test_args, args, test_prog,
                                     test_feeder)
+            print("Pass: %d, Test Accuracy: %s\n" %
+                  (pass_id, [np.mean(np.array(v)) for v in test_ret]))
+
+    print("total train time: ", time.time() - over_all_start)
 def print_arguments(args):
@@ -328,44 +312,46 @@ def main():
     if args.use_cprof:
         pr = cProfile.Profile()
         pr.enable()
     model_def = __import__("models.%s" % args.model, fromlist=["models"])
-    train_args = list(model_def.get_model(args))
-    train_args.append(args)
-    # Run optimizer.minimize(avg_loss)
-    train_args[2].minimize(train_args[0])
-    if args.memory_optimize:
-        fluid.memory_optimize(fluid.default_main_program())
+
+    train_prog = fluid.Program()
+    test_prog = fluid.Program()
+    startup_prog = fluid.Program()
+
+    train_args = list(model_def.get_model(args, True, train_prog, startup_prog))
+    test_args = list(model_def.get_model(args, False, test_prog, startup_prog))
+
+    all_args = [train_args, test_args, args]
     if args.update_method == "pserver":
-        train_prog, startup_prog = dist_transpile(trainer_id, args)
+        train_prog, startup_prog = dist_transpile(trainer_id, args, train_prog,
+                                                  startup_prog)
         if not train_prog:
             raise Exception(
                 "Must configure correct environments to run dist train.")
-        train_args.extend([train_prog, startup_prog])
+        all_args.extend([train_prog, test_prog, startup_prog])
         if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
-            train_args.extend([nccl_id_var, num_trainers, trainer_id])
-            train_parallel(*train_args)
-        train(*train_args)
+            all_args.extend([nccl_id_var, num_trainers, trainer_id])
+            train_parallel(*all_args)
+        elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
+            # start pserver with Executor
+            server_exe = fluid.Executor(fluid.CPUPlace())
+            server_exe.run(startup_prog)
+            server_exe.run(train_prog)
         exit(0)

     # for other update methods, use default programs
-    train_args.append(fluid.default_main_program())
-    train_args.append(fluid.default_startup_program())
+    all_args.extend([train_prog, test_prog, startup_prog])

     if args.update_method == "nccl2":
-        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
-    if args.gpus == 1:
-        # NOTE: parallel executor use profiler interanlly
-        if args.use_nvprof and args.device == 'GPU':
-            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
-                train(*train_args)
-        else:
-            train(*train_args)
-    else:
-        if args.device == "CPU":
-            raise Exception("Only support GPU perf with parallel exe")
-        train_args.extend([nccl_id_var, num_trainers, trainer_id])
-        train_parallel(*train_args)
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(
+            trainer_id, startup_prog)
+
+    if args.device == "CPU":
+        raise Exception("Only support GPU perf with parallel exe")
+    all_args.extend([nccl_id_var, num_trainers, trainer_id])
+    train_parallel(*all_args)


 if __name__ == "__main__":
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
import random
import functools
import numpy as np
from threading import Thread
import subprocess
import time
from Queue import Queue
import paddle
from PIL import Image, ImageEnhance
random.seed(0)
DATA_DIM = 224
THREAD = int(os.getenv("PREPROCESS_THREADS", "10"))
BUF_SIZE = 5120
DATA_DIR = '/mnt/ImageNet'
TRAIN_LIST = '/mnt/ImageNet/train.txt'
TEST_LIST = '/mnt/ImageNet/val.txt'
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
def resize_short(img, target_size):
percent = float(target_size) / min(img.size[0], img.size[1])
resized_width = int(round(img.size[0] * percent))
resized_height = int(round(img.size[1] * percent))
img = img.resize((resized_width, resized_height), Image.LANCZOS)
return img
def crop_image(img, target_size, center):
width, height = img.size
size = target_size
if center == True:
w_start = (width - size) / 2
h_start = (height - size) / 2
else:
w_start = random.randint(0, width - size)
h_start = random.randint(0, height - size)
w_end = w_start + size
h_end = h_start + size
img = img.crop((w_start, h_start, w_end, h_end))
return img
def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
aspect_ratio = math.sqrt(random.uniform(*ratio))
w = 1. * aspect_ratio
h = 1. / aspect_ratio
bound = min((float(img.size[0]) / img.size[1]) / (w**2),
(float(img.size[1]) / img.size[0]) / (h**2))
scale_max = min(scale[1], bound)
scale_min = min(scale[0], bound)
target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
scale_max)
target_size = math.sqrt(target_area)
w = int(target_size * w)
h = int(target_size * h)
i = random.randint(0, img.size[0] - w)
j = random.randint(0, img.size[1] - h)
img = img.crop((i, j, i + w, j + h))
img = img.resize((size, size), Image.LANCZOS)
return img
def rotate_image(img):
angle = random.randint(-10, 10)
img = img.rotate(angle)
return img
def distort_color(img):
def random_brightness(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Brightness(img).enhance(e)
def random_contrast(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Contrast(img).enhance(e)
def random_color(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Color(img).enhance(e)
ops = [random_brightness, random_contrast, random_color]
random.shuffle(ops)
img = ops[0](img)
img = ops[1](img)
img = ops[2](img)
return img
def process_image(sample, mode, color_jitter, rotate):
img_path = sample[0]
img = Image.open(img_path)
if mode == 'train':
if rotate: img = rotate_image(img)
img = random_crop(img, DATA_DIM)
else:
img = resize_short(img, target_size=256)
img = crop_image(img, target_size=DATA_DIM, center=True)
if mode == 'train':
if color_jitter:
img = distort_color(img)
if random.randint(0, 1) == 1:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
if img.mode != 'RGB':
img = img.convert('RGB')
img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
img -= img_mean
img /= img_std
if mode == 'train' or mode == 'val':
return img, sample[1]
elif mode == 'test':
return [img]
class XmapEndSignal():
pass
def xmap_readers(mapper,
reader,
process_num,
buffer_size,
order=False,
print_queue_state=True):
end = XmapEndSignal()
# define a worker to read samples from reader to in_queue
def read_worker(reader, in_queue):
for i in reader():
in_queue.put(i)
in_queue.put(end)
# define a worker to read samples from reader to in_queue with order flag
def order_read_worker(reader, in_queue, file_queue):
in_order = 0
for i in reader():
in_queue.put((in_order, i))
in_order += 1
in_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue
def handle_worker(in_queue, out_queue, mapper):
sample = in_queue.get()
while not isinstance(sample, XmapEndSignal):
r = mapper(sample)
out_queue.put(r)
sample = in_queue.get()
in_queue.put(end)
out_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue by order
def order_handle_worker(in_queue, out_queue, mapper, out_order):
ins = in_queue.get()
while not isinstance(ins, XmapEndSignal):
order, sample = ins
r = mapper(sample)
while order != out_order[0]:
pass
out_queue.put(r)
out_order[0] += 1
ins = in_queue.get()
in_queue.put(end)
out_queue.put(end)
def xreader():
file_queue = Queue()
in_queue = Queue(buffer_size)
out_queue = Queue(buffer_size)
out_order = [0]
# start a read worker in a thread
target = order_read_worker if order else read_worker
t = Thread(target=target, args=(reader, in_queue))
t.daemon = True
t.start()
# start several handle_workers
target = order_handle_worker if order else handle_worker
args = (in_queue, out_queue, mapper, out_order) if order else (
in_queue, out_queue, mapper)
workers = []
for i in xrange(process_num):
worker = Thread(target=target, args=args)
worker.daemon = True
workers.append(worker)
for w in workers:
w.start()
sample = out_queue.get()
start_t = time.time()
while not isinstance(sample, XmapEndSignal):
yield sample
sample = out_queue.get()
if time.time() - start_t > 3:
if print_queue_state:
print("queue sizes: ", in_queue.qsize(), out_queue.qsize())
start_t = time.time()
finish = 1
while finish < process_num:
sample = out_queue.get()
if isinstance(sample, XmapEndSignal):
finish += 1
else:
yield sample
return xreader
def _reader_creator(file_list,
mode,
shuffle=False,
color_jitter=False,
rotate=False,
xmap=True):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(full_lines)
if mode == 'train':
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
per_node_lines = len(full_lines) / trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1)
* per_node_lines]
print(
"read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
else:
lines = full_lines
for line in lines:
if mode == 'train':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "train", img_path)
yield (img_path, int(label))
elif mode == 'val':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "val", img_path)
yield (img_path, int(label))
elif mode == 'test':
img_path = os.path.join(DATA_DIR, line)
yield [img_path]
mapper = functools.partial(
process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
def load_raw_image_uint8(sample):
img_arr = np.array(Image.open(sample[0])).astype('int64')
return img_arr, int(sample[1])
def train_raw(file_list=TRAIN_LIST, shuffle=True):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(full_lines)
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
per_node_lines = len(full_lines) / trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) *
per_node_lines]
print("read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
for line in lines:
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "train", img_path)
yield (img_path, int(label))
return paddle.reader.xmap_readers(load_raw_image_uint8, reader, THREAD,
BUF_SIZE)
def train(file_list=TRAIN_LIST, xmap=True):
return _reader_creator(
file_list,
'train',
shuffle=True,
color_jitter=False,
rotate=False,
xmap=xmap)
def val(file_list=TEST_LIST, xmap=True):
return _reader_creator(file_list, 'val', shuffle=False, xmap=xmap)
def test(file_list=TEST_LIST):
return _reader_creator(file_list, 'test', shuffle=False)
if __name__ == "__main__":
c = 0
start_t = time.time()
for d in train()():
c += 1
if c >= 10000:
break
spent = time.time() - start_t
print("read 10000 speed: ", 10000 / spent, spent)
@@ -163,6 +163,19 @@ def gen_job():
     volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
     volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})

+    # add ceph volumes
+    volumes.append({
+        "name": "ceph-data",
+        "cephfs": {
+            "monitors": ["192.168.16.23:6789"],
+            "secretRef": {
+                "name": "ceph-secret"
+            },
+            "user": "admin",
+        }
+    })
+    volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
+
     tn["spec"]["template"]["spec"]["volumes"] = volumes
     tn_container["volumeMounts"] = volumeMounts
...
@@ -13,5 +13,6 @@
 # limitations under the License.

 __all__ = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
+    "resnet_with_preprocess"
 ]
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """seq2seq model for fluid."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -181,7 +182,7 @@ def lodtensor_to_ndarray(lod_tensor):
     return ndarray


-def get_model(args):
+def get_model(args, is_train, main_prog, startup_prog):
     if args.use_reader_op:
         raise Exception("machine_translation do not support reader op for now.")
     embedding_dim = 512
@@ -190,30 +191,27 @@ def get_model(args):
     dict_size = 30000
     beam_size = 3
     max_length = 250
-    avg_cost, feeding_list = seq_to_seq_net(
-        embedding_dim,
-        encoder_size,
-        decoder_size,
-        dict_size,
-        dict_size,
-        False,
-        beam_size=beam_size,
-        max_length=max_length)
-
-    # clone from default main program
-    inference_program = fluid.default_main_program().clone()
-
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-
-    train_batch_generator = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=args.batch_size * args.gpus)
-
-    test_batch_generator = paddle.batch(
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            avg_cost, feeding_list = seq_to_seq_net(
+                embedding_dim,
+                encoder_size,
+                decoder_size,
+                dict_size,
+                dict_size,
+                False,
+                beam_size=beam_size,
+                max_length=max_length)
+            if is_train:
+                optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+                optimizer.minimize(avg_cost)
+
+    batch_generator = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
-        batch_size=args.batch_size)
+            paddle.dataset.wmt14.train(dict_size)
+            if is_train else paddle.dataset.wmt14.test(dict_size),
+            buf_size=1000),
+        batch_size=args.batch_size * args.gpus)

-    return avg_cost, inference_program, optimizer, train_batch_generator, \
-        test_batch_generator, None
+    return avg_cost, optimizer, [], batch_generator, None
@@ -65,61 +65,50 @@ def cnn_model(data):
     return predict


-def get_model(args):
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1, 1, 28, 28], (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        images, label = fluid.layers.read_file(data_file)
-    else:
-        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = cnn_model(pd.read_input(images))
-            label = pd.read_input(label)
-            cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(x=cost)
-            # Evaluator
-            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)
-
-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
-    else:
-        # Train program
-        predict = cnn_model(images)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        # Evaluator
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-
-    # Optimization
-    opt = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.001, beta1=0.9, beta2=0.999)
+def get_model(args, is_train, main_prog, startup_prog):
+    # NOTE: mnist is small, we don't implement data sharding yet.
+    opt = None
+    data_file_handle = None
+    filelist = [
+        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+    ]
+    with fluid.program_guard(main_prog, startup_prog):
+        if args.use_reader_op:
+            data_file_handle = fluid.layers.open_files(
+                filenames=filelist,
+                shapes=[[-1, 1, 28, 28], (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"],
+                thread_num=1,
+                pass_num=1)
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(
+                    data_file_handle, batch_size=args.batch_size))
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                images, label = fluid.layers.read_file(data_file)
+            else:
+                images = fluid.layers.data(
+                    name='pixel', shape=[1, 28, 28], dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            predict = cnn_model(images)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+            if is_train:
+                opt = fluid.optimizer.AdamOptimizer(
+                    learning_rate=0.001, beta1=0.9, beta2=0.999)
+                opt.minimize(avg_cost)
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)

     # Reader
-    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=args.batch_size)
-    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
+    if is_train:
+        reader = paddle.dataset.mnist.train()
+    else:
+        reader = paddle.dataset.mnist.test()
+    batched_reader = paddle.batch(
+        reader, batch_size=args.batch_size * args.gpus)
+    return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
@@ -27,10 +27,17 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
-from recordio_converter import imagenet_train, imagenet_test
+# from recordio_converter import imagenet_train, imagenet_test
+from imagenet_reader import train, val


-def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  act='relu',
+                  is_train=True):
     conv1 = fluid.layers.conv2d(
         input=input,
         filter_size=filter_size,
@@ -39,29 +46,31 @@ def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
         padding=padding,
         act=None,
         bias_attr=False)
-    return fluid.layers.batch_norm(input=conv1, act=act)
+    return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)


-def shortcut(input, ch_out, stride):
+def shortcut(input, ch_out, stride, is_train=True):
     ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
     if ch_in != ch_out:
-        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        return conv_bn_layer(
+            input, ch_out, 1, stride, 0, None, is_train=is_train)
     else:
         return input


-def basicblock(input, ch_out, stride):
-    short = shortcut(input, ch_out, stride)
-    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+def basicblock(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
     return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')


-def bottleneck(input, ch_out, stride):
-    short = shortcut(input, ch_out * 4, stride)
-    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
-    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+def bottleneck(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out * 4, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
+    conv3 = conv_bn_layer(
+        conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
     return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
@@ -72,7 +81,11 @@ def layer_warp(block_func, input, ch_out, count, stride):
     return res_out


-def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+def resnet_imagenet(input,
+                    class_dim,
+                    depth=50,
+                    data_format='NCHW',
+                    is_train=True):
     cfg = {
         18: ([2, 2, 2, 1], basicblock),
@@ -115,8 +128,9 @@ def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
     return out
-def get_model(args):
+def _model_reader_dshape_classdim(args, is_train):
     model = resnet_cifar10
+    reader = None
     if args.data_set == "cifar10":
         class_dim = 10
         if args.data_format == 'NCHW':
@@ -124,8 +138,10 @@ def get_model(args):
         else:
             dshape = [32, 32, 3]
         model = resnet_cifar10
-        train_reader = paddle.dataset.cifar.train10()
-        test_reader = paddle.dataset.cifar.test10()
+        if is_train:
+            reader = paddle.dataset.cifar.train10()
+        else:
+            reader = paddle.dataset.cifar.test10()
     elif args.data_set == "flowers":
         class_dim = 102
         if args.data_format == 'NCHW':
@@ -133,8 +149,10 @@ def get_model(args):
         else:
             dshape = [224, 224, 3]
         model = resnet_imagenet
-        train_reader = paddle.dataset.flowers.train()
-        test_reader = paddle.dataset.flowers.test()
+        if is_train:
+            reader = paddle.dataset.flowers.train()
+        else:
+            reader = paddle.dataset.flowers.test()
     elif args.data_set == "imagenet":
         class_dim = 1000
         if args.data_format == 'NCHW':
@@ -145,64 +163,89 @@ def get_model(args):
         if not args.data_path:
             raise Exception(
                 "Must specify --data_path when training with imagenet")
-        train_reader = imagenet_train(args.data_path)
-        test_reader = imagenet_test(args.data_path)
-
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1] + dshape, (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        input, label = fluid.layers.read_file(data_file)
-    else:
-        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = model(pd.read_input(input), class_dim)
-            label = pd.read_input(label)
-            cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(x=cost)
-            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)
-
-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
-    else:
-        predict = model(input, class_dim)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc])
-
-    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-
-    batched_train_reader = paddle.batch(
-        train_reader if args.no_random else paddle.reader.shuffle(
-            train_reader, buf_size=5120),
-        batch_size=args.batch_size * args.gpus,
-        drop_last=True)
-    batched_test_reader = paddle.batch(
-        test_reader, batch_size=args.batch_size, drop_last=True)
-
-    return avg_cost, inference_program, optimizer, batched_train_reader,\
-        batched_test_reader, batch_acc
+        if not args.use_reader_op:
+            if is_train:
+                reader = train()
+            else:
+                reader = val()
+        else:
+            if is_train:
+                reader = train(xmap=False)
+            else:
+                reader = val(xmap=False)
+    return model, reader, dshape, class_dim
+
+
+def get_model(args, is_train, main_prog, startup_prog):
+    model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
+                                                                     is_train)
+
+    pyreader = None
+    trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=args.batch_size * args.gpus,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('float32', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            predict = model(input, class_dim, is_train=is_train)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+
+            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
+            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
+
+            # configure optimize
+            optimizer = None
+            if is_train:
+                if args.use_lars:
+                    lars_decay = 1.0
+                else:
+                    lars_decay = 0.0
+
+                total_images = 1281167 / trainer_count
+                step = int(total_images / args.batch_size + 1)
+                epochs = [30, 60, 80, 90]
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = []
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=base_lr,
+                    #learning_rate=fluid.layers.piecewise_decay(
+                    #    boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+                optimizer.minimize(avg_cost)
+
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader if args.no_random else paddle.reader.shuffle(
+                reader, buf_size=5120),
+            batch_size=args.batch_size * args.gpus,
+            drop_last=True)
+    else:
+        batched_reader = None
+        pyreader.decorate_paddle_reader(
+            paddle.batch(
+                reader if args.no_random else paddle.reader.shuffle(
                    reader, buf_size=5120),
+                batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [batch_acc1,
+                                 batch_acc5], batched_reader, pyreader
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import time
import os
import cProfile, pstats, StringIO
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
# from recordio_converter import imagenet_train, imagenet_test
from imagenet_reader import train_raw, val
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
is_train=True):
conv1 = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)
def shortcut(input, ch_out, stride, is_train=True):
ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1]
if ch_in != ch_out:
return conv_bn_layer(
input, ch_out, 1, stride, 0, None, is_train=is_train)
else:
return input
def basicblock(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def bottleneck(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out * 4, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
conv3 = conv_bn_layer(
conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
def layer_warp(block_func, input, ch_out, count, stride):
res_out = block_func(input, ch_out, stride)
for i in range(1, count):
res_out = block_func(res_out, ch_out, 1)
return res_out
def resnet_imagenet(input,
class_dim,
depth=50,
data_format='NCHW',
is_train=True):
cfg = {
18: ([2, 2, 2, 1], basicblock),
34: ([3, 4, 6, 3], basicblock),
50: ([3, 4, 6, 3], bottleneck),
101: ([3, 4, 23, 3], bottleneck),
152: ([3, 8, 36, 3], bottleneck)
}
stages, block_func = cfg[depth]
conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
pool1 = fluid.layers.pool2d(
input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
res2 = layer_warp(block_func, res1, 128, stages[1], 2)
res3 = layer_warp(block_func, res2, 256, stages[2], 2)
res4 = layer_warp(block_func, res3, 512, stages[3], 2)
pool2 = fluid.layers.pool2d(
input=res4,
pool_size=7,
pool_type='avg',
pool_stride=1,
global_pooling=True)
out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
return out
def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
assert (depth - 2) % 6 == 0
n = (depth - 2) // 6
conv1 = conv_bn_layer(
input=input, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, n, 1)
res2 = layer_warp(basicblock, res1, 32, n, 2)
res3 = layer_warp(basicblock, res2, 64, n, 2)
pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
return out
def _model_reader_dshape_classdim(args, is_train):
model = resnet_cifar10
reader = None
if args.data_set == "cifar10":
class_dim = 10
if args.data_format == 'NCHW':
dshape = [3, 32, 32]
else:
dshape = [32, 32, 3]
model = resnet_cifar10
if is_train:
reader = paddle.dataset.cifar.train10()
else:
reader = paddle.dataset.cifar.test10()
elif args.data_set == "flowers":
class_dim = 102
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if is_train:
reader = paddle.dataset.flowers.train()
else:
reader = paddle.dataset.flowers.test()
elif args.data_set == "imagenet":
class_dim = 1000
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if not args.data_path:
raise Exception(
"Must specify --data_path when training with imagenet")
if not args.use_reader_op:
if is_train:
reader = train_raw()
else:
reader = val()
else:
if is_train:
reader = train_raw()
else:
reader = val(xmap=False)
return model, reader, dshape, class_dim
def get_model(args, is_train, main_prog, startup_prog):
model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
is_train)
pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=args.batch_size * args.gpus,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('uint8', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='uint8')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
# add imagenet preprocessors
random_crop = fluid.layers.random_crop(input, dshape)
casted = fluid.layers.cast(random_crop, 'float32')
# input is HWC
trans = fluid.layers.transpose(casted, [0, 3, 1, 2]) / 255.0
img_mean = fluid.layers.tensor.assign(
np.array([0.485, 0.456, 0.406]).astype('float32').reshape((3, 1,
1)))
img_std = fluid.layers.tensor.assign(
np.array([0.229, 0.224, 0.225]).astype('float32').reshape((3, 1,
1)))
h1 = fluid.layers.elementwise_sub(trans, img_mean, axis=1)
h2 = fluid.layers.elementwise_div(h1, img_std, axis=1)
# pre_out = (trans - img_mean) / img_std
predict = model(h2, class_dim, is_train=is_train)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
# configure optimize
optimizer = None
if is_train:
if args.use_lars:
lars_decay = 1.0
else:
lars_decay = 0.0
total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1)
epochs = [30, 60, 80, 90]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=base_lr,
#learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if not args.use_reader_op:
batched_reader = paddle.batch(
reader if args.no_random else paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size * args.gpus,
drop_last=True)
else:
batched_reader = None
pyreader.decorate_paddle_reader(
paddle.batch(
# reader if args.no_random else paddle.reader.shuffle(
# reader, buf_size=5120),
reader,
batch_size=args.batch_size))
return avg_cost, optimizer, [batch_acc1,
batch_acc5], batched_reader, pyreader
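The in-graph preprocessing above (random crop, cast, HWC-to-CHW transpose, then mean/std normalization via elementwise_sub/div with axis=1) computes the same arithmetic that imagenet_reader.process_image performs on the CPU. A small NumPy sanity check of that math (shapes and the random batch are illustrative, not part of the benchmark):

# NumPy rendering of the in-graph preprocessing above (illustrative only):
# uint8 HWC crops -> float CHW in [0, 1] -> per-channel normalize.
import numpy as np

img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))

batch_hwc = np.random.randint(0, 256, (8, 224, 224, 3)).astype('uint8')
trans = batch_hwc.astype('float32').transpose((0, 3, 1, 2)) / 255.0
normalized = (trans - img_mean) / img_std  # broadcasts over N, H, W

# same result the graph computes with elementwise_sub/div(axis=1)
assert normalized.shape == (8, 3, 224, 224)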
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import math
import os
from imagenet_reader import train, val
__all__ = [
"SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d",
"SE_ResNeXt152_32x4d", "get_model"
]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class SE_ResNeXt():
def __init__(self, layers=50, is_train=True):
self.params = train_parameters
self.layers = layers
self.is_train = is_train
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 101:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 23, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 152:
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=3,
stride=2,
act='relu')
conv = self.conv_bn_layer(
input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
conv = self.conv_bn_layer(
input=conv,
num_filters=128,
filter_size=3,
stride=1,
act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.5)
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(input=drop,
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
return out
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
filter_size = 1
return self.conv_bn_layer(input, ch_out, filter_size, stride)
else:
return input
def bottleneck_block(self, input, num_filters, stride, cardinality,
reduction_ratio):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu')
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
scale = self.squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
short = self.shortcut(input, num_filters * 2, stride)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) / 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(
input=conv, act=act, is_test=not self.is_train)
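    # Squeeze-and-Excitation: global average pool ("squeeze"), a bottleneck
    # FC of width num_channels/reduction_ratio with ReLU, then a sigmoid-gated
    # FC ("excitation") whose output rescales each channel of the input.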
def squeeze_excitation(self, input, num_channels, reduction_ratio):
pool = fluid.layers.pool2d(
input=input, pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(input=pool,
                                  size=num_channels // reduction_ratio,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(
-stdv, stdv)))
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(input=squeeze,
size=num_channels,
act='sigmoid',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(
-stdv, stdv)))
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def SE_ResNeXt50_32x4d():
model = SE_ResNeXt(layers=50)
return model
def SE_ResNeXt101_32x4d():
model = SE_ResNeXt(layers=101)
return model
def SE_ResNeXt152_32x4d():
model = SE_ResNeXt(layers=152)
return model
def get_model(args, is_train, main_prog, startup_prog):
model = SE_ResNeXt(layers=50)
batched_reader = None
pyreader = None
    trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))  # default to one trainer when unset
dshape = train_parameters["input_size"]
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=10,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('float32', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
out = model.net(input=input)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
optimizer = None
if is_train:
if args.use_lars:
lars_decay = 1.0
else:
lars_decay = 0.0
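                # Piecewise decay: milestones are given in epochs, so convert
                # them to step boundaries first; the LR then drops 10x at each
                # boundary (len(bd) + 1 values in total).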
                total_images = 1281167 // trainer_count
                step = int(total_images / args.batch_size + 1)
                epochs = [40, 80, 100]
                bd = [step * e for e in epochs]
                base_lr = args.learning_rate
                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
                optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4),
LARS_weight_decay=lars_decay)
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if is_train:
reader = train()
else:
reader = val()
if not args.use_reader_op:
batched_reader = paddle.batch(
reader, batch_size=args.batch_size * args.gpus, drop_last=True)
else:
pyreader.decorate_paddle_reader(
paddle.batch(
reader, batch_size=args.batch_size))
return avg_cost, optimizer, [acc_top1, acc_top5], batched_reader, pyreader
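# --- Hypothetical usage sketch (not part of the benchmark harness) --------
# A driver would build two programs, hand them to get_model, and run the
# startup program once before training; `args.device` is an assumed field:
#
#   main_prog, startup_prog = fluid.Program(), fluid.Program()
#   loss, opt, metrics, batched_reader, pyreader = get_model(
#       args, is_train=True, main_prog=main_prog, startup_prog=startup_prog)
#   place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
#   exe = fluid.Executor(place)
#   exe.run(startup_prog)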
...@@ -26,7 +26,6 @@ import numpy ...@@ -26,7 +26,6 @@ import numpy
import paddle import paddle
import paddle.dataset.imdb as imdb import paddle.dataset.imdb as imdb
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.batch as batch
import paddle.fluid.profiler as profiler import paddle.fluid.profiler as profiler
word_dict = imdb.word_dict() word_dict = imdb.word_dict()
...@@ -43,19 +42,7 @@ def crop_sentence(reader, crop_size): ...@@ -43,19 +42,7 @@ def crop_sentence(reader, crop_size):
return __impl__ return __impl__
def get_model(args): def lstm_net(sentence, lstm_size):
if args.use_reader_op:
raise Exception(
"stacked_dynamic_lstm do not support reader op for now.")
lstm_size = 512
emb_dim = 512
crop_size = 1500
data = fluid.layers.data(
name="words", shape=[1], lod_level=1, dtype='int64')
sentence = fluid.layers.embedding(
input=data, size=[len(word_dict), emb_dim])
sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
rnn = fluid.layers.DynamicRNN() rnn = fluid.layers.DynamicRNN()
...@@ -97,31 +84,47 @@ def get_model(args): ...@@ -97,31 +84,47 @@ def get_model(args):
last = fluid.layers.sequence_pool(rnn(), 'last') last = fluid.layers.sequence_pool(rnn(), 'last')
logit = fluid.layers.fc(input=last, size=2, act='softmax') logit = fluid.layers.fc(input=last, size=2, act='softmax')
loss = fluid.layers.cross_entropy( return logit
input=logit,
label=fluid.layers.data(
name='label', shape=[1], dtype='int64'))
loss = fluid.layers.mean(x=loss)
# add acc
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
shape=[1], dtype='int64'), total=batch_size_tensor)
inference_program = fluid.default_main_program().clone() def get_model(args, is_train, main_prog, startup_prog):
with fluid.program_guard(inference_program): if args.use_reader_op:
inference_program = fluid.io.get_inference_program( raise Exception(
            target_vars=[batch_acc, batch_size_tensor])             "stacked_dynamic_lstm does not support reader op for now.")
lstm_size = 512
adam = fluid.optimizer.Adam() emb_dim = 512
crop_size = 1500
train_reader = batch( with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
data = fluid.layers.data(
name="words", shape=[1], lod_level=1, dtype='int64')
sentence = fluid.layers.embedding(
input=data, size=[len(word_dict), emb_dim])
logit = lstm_net(sentence, lstm_size)
loss = fluid.layers.cross_entropy(
input=logit,
label=fluid.layers.data(
name='label', shape=[1], dtype='int64'))
loss = fluid.layers.mean(x=loss)
# add acc
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
shape=[1], dtype='int64'), total=batch_size_tensor)
if is_train:
adam = fluid.optimizer.Adam()
adam.minimize(loss)
if is_train:
reader = crop_sentence(imdb.train(word_dict), crop_size)
else:
reader = crop_sentence(imdb.test(word_dict), crop_size)
batched_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), reader, buf_size=25000),
batch_size=args.batch_size * args.gpus) batch_size=args.batch_size * args.gpus)
test_reader = batch(
paddle.reader.shuffle(
crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
batch_size=args.batch_size)
return loss, inference_program, adam, train_reader, test_reader, batch_acc return loss, adam, [batch_acc], batched_reader, None
...@@ -25,7 +25,7 @@ import functools ...@@ -25,7 +25,7 @@ import functools
import os import os
def vgg16_bn_drop(input): def vgg16_bn_drop(input, is_train=True):
def conv_block(input, num_filter, groups, dropouts): def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group( return fluid.nets.img_conv_group(
input=input, input=input,
...@@ -46,13 +46,13 @@ def vgg16_bn_drop(input): ...@@ -46,13 +46,13 @@ def vgg16_bn_drop(input):
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None) fc1 = fluid.layers.fc(input=drop, size=512, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu') bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train)
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None) fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
return fc2 return fc2
def get_model(args): def get_model(args, is_train, main_prog, startup_prog):
if args.data_set == "cifar10": if args.data_set == "cifar10":
classdim = 10 classdim = 10
if args.data_format == 'NCHW': if args.data_format == 'NCHW':
...@@ -65,57 +65,56 @@ def get_model(args): ...@@ -65,57 +65,56 @@ def get_model(args):
data_shape = [3, 224, 224] data_shape = [3, 224, 224]
else: else:
data_shape = [224, 224, 3] data_shape = [224, 224, 3]
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
with fluid.program_guard(main_prog, startup_prog):
        data_file_handle = None  # stays None when the reader op is not used
        if args.use_reader_op:
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + data_shape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=1,
pass_num=1)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file_handle, batch_size=args.batch_size))
with fluid.unique_name.guard():
if args.use_reader_op:
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='data', shape=data_shape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images, is_train=is_train)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
if args.use_reader_op: # Evaluator
filelist = [ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
os.path.join(args.data_path, f) for f in os.listdir(args.data_path) batch_acc = fluid.layers.accuracy(
] input=predict, label=label, total=batch_size_tensor)
data_file = fluid.layers.open_files( # Optimization
filenames=filelist, if is_train:
shapes=[[-1] + data_shape, (-1, 1)], optimizer = fluid.optimizer.Adam(
lod_levels=[0, 0], learning_rate=args.learning_rate)
dtypes=["float32", "int64"], optimizer.minimize(avg_cost)
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='data', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc, batch_size_tensor])
# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
# data reader # data reader
train_reader = paddle.batch( if is_train:
reader = paddle.dataset.cifar.train10() \
if args.data_set == 'cifar10' else paddle.dataset.flowers.train()
else:
reader = paddle.dataset.cifar.test10() \
if args.data_set == 'cifar10' else paddle.dataset.flowers.test()
batched_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.cifar.train10() reader, buf_size=5120),
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size * args.gpus) batch_size=args.batch_size * args.gpus)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=args.batch_size)
return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle
...@@ -16,16 +16,6 @@ set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR}) ...@@ -16,16 +16,6 @@ set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so) set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so) set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
include_directories(${ANAKIN_INCLUDE}) include_directories(${ANAKIN_INCLUDE})
include_directories(${ANAKIN_INCLUDE}/saber/) include_directories(${ANAKIN_INCLUDE}/saber/)
include_directories(${ANAKIN_INCLUDE}/saber/core/) include_directories(${ANAKIN_INCLUDE}/saber/core/)
...@@ -48,21 +38,24 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS ...@@ -48,21 +38,24 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder -Wno-reorder
-Wno-error=cpp) -Wno-error=cpp)
if(WITH_GPU)
set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=YES -DCUDNN_ROOT=${CUDNN_ROOT} -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR})
else()
set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=NO)
endif()
ExternalProject_Add( ExternalProject_Add(
extern_anakin extern_anakin
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLML_PROJECT} DEPENDS ${MKLML_PROJECT}
GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin" GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin"
GIT_TAG "9424277cf9ae180a14aff09560d3cd60a49c76d2" GIT_TAG "3c8554f4978628183566ab7dd6c1e7e66493c7cd"
PREFIX ${ANAKIN_SOURCE_DIR} PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES CMAKE_ARGS ${CMAKE_ARGS_PREFIX}
-DUSE_X86_PLACE=YES -DUSE_X86_PLACE=YES
-DBUILD_WITH_UNIT_TEST=NO -DBUILD_WITH_UNIT_TEST=NO
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
-DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-DCUDNN_ROOT=${CUDNN_ROOT}
-DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
-DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER} -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
${EXTERNAL_OPTIONAL_ARGS} ${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
......
...@@ -145,12 +145,12 @@ copy(memory_lib ...@@ -145,12 +145,12 @@ copy(memory_lib
set(inference_deps paddle_fluid_shared paddle_fluid) set(inference_deps paddle_fluid_shared paddle_fluid)
set(module "inference/api") set(module "inference/api")
if (WITH_ANAKIN AND WITH_GPU) if (WITH_ANAKIN AND WITH_MKL)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${ANAKIN_INSTALL_DIR} # anakin release ${ANAKIN_INSTALL_DIR} # anakin release
DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin) DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
list(APPEND inference_deps anakin_inference_lib) list(APPEND inference_deps anakin_inference_lib)
endif() endif()
......
...@@ -822,6 +822,14 @@ pad ...@@ -822,6 +822,14 @@ pad
.. autofunction:: paddle.fluid.layers.pad .. autofunction:: paddle.fluid.layers.pad
:noindex: :noindex:
.. _api_fluid_layers_pad_constant_like:
pad_constant_like
-----------------
.. autofunction:: paddle.fluid.layers.pad_constant_like
:noindex:
.. _api_fluid_layers_label_smooth: .. _api_fluid_layers_label_smooth:
label_smooth label_smooth
...@@ -1145,6 +1153,14 @@ sigmoid ...@@ -1145,6 +1153,14 @@ sigmoid
.. autofunction:: paddle.fluid.layers.sigmoid .. autofunction:: paddle.fluid.layers.sigmoid
:noindex: :noindex:
.. _api_fluid_layers_hsigmoid:
hsigmoid
--------
.. autofunction:: paddle.fluid.layers.hsigmoid
:noindex:
.. _api_fluid_layers_logsigmoid: .. _api_fluid_layers_logsigmoid:
logsigmoid logsigmoid
......
...@@ -104,6 +104,7 @@ visualDL --logdir=scratch_log --port=8080 ...@@ -104,6 +104,7 @@ visualDL --logdir=scratch_log --port=8080
# Visit http://127.0.0.1:8080 # Visit http://127.0.0.1:8080
``` ```
If you see `TypeError: __init__() got an unexpected keyword argument 'file'`, your protobuf is older than 3.5; running `pip install --upgrade protobuf` fixes it.
If you still run into installation problems inside a virtual environment, try the following methods. If you still run into installation problems inside a virtual environment, try the following methods.
......
...@@ -4,13 +4,12 @@ Paddle Inference API ...@@ -4,13 +4,12 @@ Paddle Inference API
To make inference deployment simpler and easier, Fluid provides a set of high-level APIs To make inference deployment simpler and easier, Fluid provides a set of high-level APIs
that hide the different underlying optimized implementations. that hide the different underlying optimized implementations.
`The inference library code <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__ `The inference library code <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`_
includes includes
- the header ``paddle_inference_api.h``, which defines all the interfaces - the header ``paddle_inference_api.h``, which defines all the interfaces
- the library ``libpaddle_fluid.so`` or ``libpaddle_fluid.a`` - the library ``libpaddle_fluid.so`` or ``libpaddle_fluid.a``
- the library ``libpaddle_inference_api.so`` or
  ``libpaddle_inference_api.a``
For compilation and dependencies, see :ref:`install_or_build_cpp_inference_lib`. For compilation and dependencies, see :ref:`install_or_build_cpp_inference_lib`.
...@@ -97,8 +96,7 @@ engine ...@@ -97,8 +96,7 @@ engine
CHECK(predictor->Run(slots, &outputs)); CHECK(predictor->Run(slots, &outputs));
  // fetch outputs ...   // fetch outputs ...
When compiling, link both ``libpaddle_fluid.a/.so`` and When compiling, linking ``libpaddle_fluid.a/.so`` is enough.
``libpaddle_inference_api.a/.so``.
Detailed code reference Detailed code reference
----------------------- -----------------------
......
...@@ -2,42 +2,47 @@ ...@@ -2,42 +2,47 @@
## Automatic Differentiation ## Automatic Differentiation
A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers. Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf). A key challenge in deep learning is to automatically derive the backward pass given the forward pass as a program, which has been long studied in the field of [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf), or autodiff, before the prosperity of deep learning.
## The Tape ## Program Transformation vs. Backtracking
Given the forward pass program (usually in Python in practices), there are two strategies to derive the backward pass: Given the forward pass program, there are two strategies to derive the backward pass:
1. from the forward pass program itself, or 1. by transforming the forward pass program without executing it, or
1. from the execution trace of the forward pass program, which is often known as the *tape*. 1. by backtracking the execution process of the forward pass program.
This article surveys systems that follow the latter strategy. This article is about the latter strategy.
## Dynamic Network ## The Tape and Dynamic Networks
When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration. This is known as *dynamic network*. We refer to the trace of the execution of the forward pass program as a *tape* [[1]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf). When we train a deep learning model, the tape changes every iteration as the input data change, so we'd have to re-derive the backward pass each time. This is time-consuming, but it naturally handles forward programs that contain control flow like if-else and for/while, whose execution traces may change across iterations. Such changes are known as *dynamic networks* in the field of deep learning.
Deep learning systems that utilize the idea of dynamic network gained their popularities in recent years. This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/). ## Typical Systems
## An Overview Deep learning systems that utilize the idea of dynamic networks have gained popularity in recent years. This article surveys the following typical systems:
Both frameworks record a ‘tape’ of the computation and interpreting (or run-time compiling) a transformation of the tape played back in reverse. This tape is a different kind of entity than the original program.[[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf) - [DyNet](https://dynet.readthedocs.io/en/latest/)
- [PyTorch](https://pytorch.org/)
- Chainer
- Autograd from HIPS
Consider the following code feedforward model. Before diving into these systems, let us pose an example forward pass program:
```python ```python
x = Variable(randn(20, 1)) x = Variable(randn(20, 1))
label = Variable(randint(1)) label = Variable(randint(1))
W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20)) W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20))
h = matmul(W_1, x) h = matmul(W_1, x)
pred = matmul(W_2, x) pred = matmul(W_2, h)
loss = softmax(pred, label) loss = softmax(pred, label)
loss.backward() loss.backward()
``` ```
### 1) Dynet uses List to encode the Tape ## The Representation of Tapes
During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, are recorded in the tape, along with the necessary information needed to do the backward such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`. ### DyNet: the Tape as a List
DyNet uses a linear data structure, a list, to represent the tape. During the execution of the above example, it is a list of operators: `matmul`, `matmul`, and `softmax`. The list also includes information needed to do the backward pass, such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.
<details> <details>
<summary></summary> <summary></summary>
...@@ -69,9 +74,9 @@ digraph g { ...@@ -69,9 +74,9 @@ digraph g {
![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20}) ![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20})
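To make the list idea concrete, here is a minimal, self-contained sketch (toy helper names and NumPy for the math; this is not DyNet's actual API): every forward op appends a backward closure to a global list, and `backward()` replays the list in reverse.

```python
import numpy as np

# A toy list-based tape: each forward op appends a backward closure;
# backward() replays the list in reverse, accumulating gradients in a
# dict keyed by array identity.
tape, grads = [], {}

def _acc(arr, g):
    grads[id(arr)] = grads.get(id(arr), np.zeros_like(arr)) + g

def matmul(W, x):
    out = W @ x
    def bwd():
        g_out = grads[id(out)]
        _acc(W, g_out @ x.T)   # dL/dW = g_out * x^T
        _acc(x, W.T @ g_out)   # dL/dx = W^T * g_out
    tape.append((out, bwd))
    return out

def backward(loss):
    grads[id(loss)] = np.ones_like(loss)   # seed dL/dL = 1
    for out, bwd in reversed(tape):        # play the tape backwards
        if id(out) in grads:
            bwd()

W_1, W_2 = np.random.randn(20, 20), np.random.randn(10, 20)
x = np.random.randn(20, 1)
h = matmul(W_1, x)        # tape: [matmul]
pred = matmul(W_2, h)     # tape: [matmul, matmul]
backward(pred)            # grads[id(W_1)], grads[id(x)], ... are now filled
```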
### 2) Pytorch uses Node Graph to encode the Tape ### PyTorch: the Tape as a Graph
The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order. The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a `Function` records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad ops run in the sorted order. Please be aware that a `Function` might have more than one `prev_func`.
<details> <details>
<summary></summary> <summary></summary>
...@@ -132,27 +137,22 @@ digraph g { ...@@ -132,27 +137,22 @@ digraph g {
![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20}) ![Alt 
text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20})
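A similarly minimal sketch of the node-graph idea (toy classes, not PyTorch's actual API): `Variable`s remember their `creator`, `Function`s remember their `prev_funcs`, and `backward()` topologically sorts the functions before replaying them.

```python
# A toy node graph: Variables remember their creator Function; Functions
# remember their prev_funcs; backward() topologically sorts the Functions
# and runs the grad ops in reverse order.
class Function:
    def __init__(self, name, prev_funcs):
        self.name, self.prev_funcs = name, prev_funcs

class Variable:
    def __init__(self, creator=None):
        self.creator = creator

    def backward(self):
        order, seen = [], set()

        def visit(fn):                      # DFS over prev_func edges
            if fn is None or fn in seen:
                return
            seen.add(fn)
            for prev in fn.prev_funcs:
                visit(prev)
            order.append(fn)                # post-order = topological order

        visit(self.creator)
        for fn in reversed(order):          # grad ops run last-to-first
            print("running grad op of", fn.name)

def matmul(a, b):
    prev = [v.creator for v in (a, b) if v.creator is not None]
    return Variable(creator=Function("matmul", prev))

x, W_1, W_2 = Variable(), Variable(), Variable()
h = matmul(W_1, x)       # h.creator is the first matmul
pred = matmul(W_2, h)    # pred.creator.prev_funcs == [h.creator]
pred.backward()          # prints the two matmul grad ops in reverse order
```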
Chainer and Autograd uses the similar techniques to record the forward pass. For details please refer to the appendix. Chainer and Autograd use similar techniques to record the forward pass. For details, please refer to the appendix.
## Design choices
### 1) Dynet's List vs Pytorch's Node Graph ## Comparison: List vs. Graph
What's good about List: The list of DyNet can be considered the result of a topological sort of PyTorch's graph. Put differently, the graph is the raw representation of the tape, which gives us the chance to *prune* the parts of the graph that are irrelevant to the backward pass before the topological sort [[2]](https://openreview.net/pdf?id=BJJsrmfCZ). Consider the following example: PyTorch only does the backward pass on `SmallNet`, while DyNet does it on both `SmallNet` and `BigNet`:
1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and calling the corresponding backward operator.
1. It promises effient data parallelism implementations. One could count the time of usage of a certain variable during the construction list. Then in the play back, one knows the calculation of a variable has completed. This enables communication and computation overlapping.
What's good about Node Graph:
1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. [[2]](https://openreview.net/pdf?id=BJJsrmfCZ) Consider the following example, Pytorch only does backward on SmallNet while Dynet does both BigNet and SmallNet.
```python ```python
result = BigNet(data) result = BigNet(data)
loss = SmallNet(data) loss = SmallNet(data)
loss.backward() loss.backward()
``` ```
### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation ## Lazy vs. Immediate Evaluation
Another difference between DyNet and PyTorch is that DyNet lazily evaluates the forward pass, whereas PyTorch executes it immediately. Consider the following example:
Dynet builds the list in a symbolic matter. Consider the following example
```python ```python
for epoch in range(num_epochs): for epoch in range(num_epochs):
for in_words, out_label in training_data: for in_words, out_label in training_data:
...@@ -164,16 +164,17 @@ for epoch in range(num_epochs): ...@@ -164,16 +164,17 @@ for epoch in range(num_epochs):
loss_val = loss_sym.value() loss_val = loss_sym.value()
loss_sym.backward() loss_sym.backward()
``` ```
The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This deferred execution is useful because it makes some graph-level optimizations, e.g. kernel fusion, possible. The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This deferred execution is useful because it makes some graph-level optimizations, e.g. kernel fusion, possible.
Pytorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`. PyTorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
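The contrast can be boiled down to a few lines (a toy `Sym` class, not DyNet's API): building the expression only records it, and nothing runs until `.value()` is called.

```python
# A toy symbolic node: constructing Sym objects only records the expression;
# nothing is computed until .value() is called, mirroring DyNet's deferred
# evaluation. PyTorch would instead compute each op eagerly.
class Sym:
    def __init__(self, fn, args=()):
        self.fn, self.args = fn, args

    def value(self):
        return self.fn(*[a.value() if isinstance(a, Sym) else a
                         for a in self.args])

def const(v):
    return Sym(lambda: v)

def add(a, b):
    return Sym(lambda x, y: x + y, (a, b))

loss_sym = add(add(const(1), const(2)), const(3))  # nothing computed yet
print(loss_sym.value())                            # 6, computed on demand
```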
## What can fluid learn from them? ## Fluid: Learning the Lessons
Please refer to `paddle/contrib/dynamic/`. Please refer to `paddle/contrib/dynamic/`.
# Appendix ## Appendix
### Overview ### Overview
......
...@@ -43,6 +43,7 @@ paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', ...@@ -43,6 +43,7 @@ paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list',
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None)) paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.Trainer.save_inference_model ArgSpec(args=['self', 'param_path', 'feeded_var_names', 'target_var_indexes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None)
...@@ -65,7 +66,7 @@ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'pla ...@@ -65,7 +66,7 @@ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'pla
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0)) paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0, None))
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
...@@ -312,7 +313,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw ...@@ -312,7 +313,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
...@@ -376,7 +377,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l ...@@ -376,7 +377,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5)) paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0)) paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95)) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
......
...@@ -326,7 +326,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -326,7 +326,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
ir::Graph &result = *graph; ir::Graph &result = *graph;
for (auto &node : nodes) { for (auto &node : nodes) {
if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) { if (node->IsVar() && node->Var()) {
all_vars_.emplace(node->Name(), node->Var()); all_vars_.emplace(node->Name(), node->Var());
} }
} }
...@@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp( ...@@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
} }
} }
bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const {
bool is_pg_once =
grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
if (is_pg_once) {
// Insert NCCL AllReduce Op
og_has_been_broadcast->insert(og);
}
return is_pg_once;
}
int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
ir::Node *node) const { ir::Node *node) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
...@@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, ...@@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
return var; return var;
} }
// Find the first occurence of `prev_op_name` and make current `op` depend
// on it.
void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op,
const std::string &prev_op_name) const {
for (auto &prev_op : result->Get<GraphOps>(kGraphOps)) {
if (prev_op->Name() == prev_op_name) {
auto *dep_var = new DummyVarHandle(result->CreateControlDepVar());
prev_op->AddOutput(dep_var);
result->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
op->AddInput(dep_var);
}
}
}
void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
ir::Node *node) const { ir::Node *node) const {
int op_dev_id = -1; int op_dev_id = -1;
......
...@@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
std::vector<std::string> FindDistTrainRecvVars( std::vector<std::string> FindDistTrainRecvVars(
const std::vector<ir::Node *> &nodes) const; const std::vector<ir::Node *> &nodes) const;
void ConnectOp(ir::Graph *result, OpHandleBase *op,
const std::string &prev_op_name) const;
void CreateComputationalOps(ir::Graph *result, ir::Node *node, void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const; size_t num_places) const;
...@@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void CreateComputationalOp(ir::Graph *result, ir::Node *node, void CreateComputationalOp(ir::Graph *result, ir::Node *node,
int dev_id) const; int dev_id) const;
bool IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const;
int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const; int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;
void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
......
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n") file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n")
file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
function(pass_library TARGET)
# Usage: pass_library(TARGET DEST); a pass whose DEST is "base" or "inference" is appended to paddle_inference_pass.h
function(pass_library TARGET DEST)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass) cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass ${op_library_DEPS})
  file(APPEND ${pass_file} "USE_PASS(${TARGET});\n") # Add more DEST values here (e.g. train, dist) to collect the USE_PASS declarations into a file automatically.
set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE) if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
message(STATUS "add pass ${TARGET} ${DEST}")
file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
endif()
endfunction() endfunction()
cc_library(node SRCS node.cc DEPS proto_desc) cc_library(node SRCS node.cc DEPS proto_desc)
...@@ -18,13 +25,15 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper) ...@@ -18,13 +25,15 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph) cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
pass_library(graph_to_program_pass) pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass) pass_library(graph_viz_pass base)
pass_library(fc_fuse_pass) pass_library(fc_fuse_pass inference)
pass_library(attention_lstm_fuse_pass) pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass) pass_library(infer_clean_graph_pass inference)
pass_library(fc_lstm_fuse_pass) pass_library(fc_lstm_fuse_pass inference)
pass_library(seq_concat_fc_fuse_pass) pass_library(fc_gru_fuse_pass inference)
pass_library(seq_concat_fc_fuse_pass inference)
set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
......
...@@ -13,13 +13,10 @@ ...@@ -13,13 +13,10 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h" #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
#include <string> #include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/helper.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
namespace framework {
namespace ir {
static void BuildPattern(PDPattern* pattern, const std::string& name_scope,
bool with_fc_bias) {
PDNode* x = pattern->NewNode(name_scope, "x")
->assert_is_op_input("mul")
->assert_var_not_persistable();
auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias);
fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse.
patterns::GRU(pattern, name_scope, fc_out);
VLOG(3) << "fc_gru pattern \n" << pattern->DotString();
}
static int BuildFusion(Graph* graph, const std::string& name_scope,
Scope* scope, bool with_fc_bias) {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
BuildPattern(pattern, name_scope, with_fc_bias);
// Create New OpDesc
  auto gru_creator = [&](int gru, int x, int weight_x, int weight_h, int bias,
int hidden, int fc_bias) {
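    // The arguments are pattern-node ids; GET_NODE(x) resolves id `x` to the
    // matched ir::Node*, bound to the local name `x_n`.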
#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
GET_NODE(x);
GET_NODE(weight_x);
GET_NODE(weight_h);
GET_NODE(bias);
GET_NODE(hidden);
GET_NODE(gru);
OpDesc op_desc;
op_desc.SetType("fusion_gru");
#define NEW_NAME(x) name_scope + "/at." #x ".new"
#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()});
SET_IN(X, x);
SET_IN(WeightX, weight_x);
SET_IN(WeightH, weight_h);
if (with_fc_bias) {
op_desc.SetInput("Bias", {NEW_NAME(bias) + bias_n->Name()});
} else {
SET_IN(Bias, bias);
}
#undef SET_IN
op_desc.SetInput("H0", {});
op_desc.SetOutput("Hidden", {hidden_n->Name()});
op_desc.SetAttr("is_reverse", gru_n->Op()->GetAttr("is_reverse"));
    // TODO(TJ): This should be an option for inference
op_desc.SetAttr("use_seq", true);
#define SET_INTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)})
    SET_INTERMEDIATE_OUT(ReorderedH0);
    SET_INTERMEDIATE_OUT(XX);
    SET_INTERMEDIATE_OUT(BatchedInput);
    SET_INTERMEDIATE_OUT(BatchedOut);
#undef SET_INTERMEDIATE_OUT
auto* op = graph->CreateOpNode(&op_desc);
PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto* scope = graph->Get<Scope*>(kParamScopeAttr);
PADDLE_ENFORCE(scope);
if (with_fc_bias) {
// Fusion GRU bias = fcbias + grubias
auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias_n->Name());
auto* out_bias_tensor =
fusion_bias_var->GetMutable<framework::LoDTensor>();
PADDLE_ENFORCE(fusion_bias_var);
GET_NODE(fc_bias);
PADDLE_ENFORCE(fc_bias_n);
auto* gru_bias_var = scope->FindVar(bias_n->Name());
auto* fc_bias_var = scope->FindVar(fc_bias_n->Name());
PADDLE_ENFORCE(gru_bias_var);
PADDLE_ENFORCE(fc_bias_var);
      const auto& gru_bias_tensor = gru_bias_var->Get<framework::LoDTensor>();
      const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
      // new bias = fc bias + gru bias
      out_bias_tensor->Resize(gru_bias_tensor.dims());
      auto* data = out_bias_tensor->mutable_data<float>(platform::CPUPlace());
      for (int i = 0; i < out_bias_tensor->numel(); i++) {
        data[i] =
            fc_bias_tensor.data<float>()[i] + gru_bias_tensor.data<float>()[i];
      }
}
#undef GET_NODE
#define NEW_INTERMEDIATE_OUT(key) \
  scope->Var(NEW_NAME(key))->GetMutable<framework::LoDTensor>()
    NEW_INTERMEDIATE_OUT(ReorderedH0);
    NEW_INTERMEDIATE_OUT(XX);
    NEW_INTERMEDIATE_OUT(BatchedInput);
    NEW_INTERMEDIATE_OUT(BatchedOut);
#undef NEW_NAME
#undef NEW_INTERMEDIATE_OUT
IR_NODE_LINK_TO(x_n, op);
IR_NODE_LINK_TO(weight_x_n, op);
IR_NODE_LINK_TO(weight_h_n, op);
    IR_NODE_LINK_TO(bias_n, op);  // should actually link to the merged bias when with_fc_bias is set
IR_NODE_LINK_TO(op, hidden_n);
// h0?
return op;
};
int fusion_count{0};
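  // Called once per matched subgraph: GET_NODE resolves every captured node,
  // gru_creator emits the fused fusion_gru op, and the now-dead intermediate
  // nodes are removed from the graph.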
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
#define GET_NODE(name__) \
std::string name__##key = name_scope + "/" + #name__; \
auto* name__##n = pattern->RetrieveNode(name__##key); \
PADDLE_ENFORCE(name__##n); \
PADDLE_ENFORCE(subgraph.count(name__##n)); \
Node* name__##_n = subgraph.at(name__##n); \
int name__ __attribute__((unused)) = name__##_n->id();
GET_NODE(x);
GET_NODE(w); // fc weight
GET_NODE(mul);
GET_NODE(fc_out);
GET_NODE(Weight);
GET_NODE(gru);
GET_NODE(Bias);
GET_NODE(Hidden);
// nodes need be removed
GET_NODE(BatchGate);
GET_NODE(BatchResetHiddenPrev);
GET_NODE(BatchHidden);
if (with_fc_bias) {
GET_NODE(mul_out);
GET_NODE(fc_bias);
GET_NODE(elementwise_add);
      gru_creator(gru, x, w, Weight, Bias, Hidden, fc_bias);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul_n, gru_n, elementwise_add_n, fc_bias_n, fc_out_n, mul_out_n,
BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n});
GraphSafeRemoveNodes(graph, marked_nodes);
} else {
      gru_creator(gru, x, w, Weight, Bias, Hidden, -1);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul_n, gru_n, BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n});
GraphSafeRemoveNodes(graph, marked_nodes);
}
#undef GET_NODE
++fusion_count;
};
gpd(graph, handler);
return fusion_count;
}
std::unique_ptr<ir::Graph> MulGRUFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
false /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
true /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
// Both FCGRUFusePass and MulGRUFusePass fuse to the same fusion_gru op;
// FCGRUFusePass additionally folds the FC bias into the fused op.
class FCGRUFusePass : public FusePassBase {
public:
virtual ~FCGRUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"fc_gru_fuse"};
};
// Just FC without bias
class MulGRUFusePass : public FusePassBase {
public:
virtual ~MulGRUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"fc_nobias_gru_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -20,12 +20,13 @@ namespace paddle { ...@@ -20,12 +20,13 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
std::string GenNodeName(const std::string& prefix, const std::string& name) { static std::string GenNodeName(const std::string& prefix,
const std::string& name) {
return prefix + "/" + name; return prefix + "/" + name;
} }
void BuildPattern(PDPattern* pattern, const std::string& name_scope, static void BuildPattern(PDPattern* pattern, const std::string& name_scope,
bool with_fc_bias) { bool with_fc_bias) {
PDNode* x = pattern->NewNode(name_scope, "x") PDNode* x = pattern->NewNode(name_scope, "x")
->assert_is_op_input("mul") ->assert_is_op_input("mul")
->assert_var_not_persistable(); ->assert_var_not_persistable();
...@@ -35,8 +36,8 @@ void BuildPattern(PDPattern* pattern, const std::string& name_scope, ...@@ -35,8 +36,8 @@ void BuildPattern(PDPattern* pattern, const std::string& name_scope,
// LOG(INFO) << "\n" << pattern->DotString(); // LOG(INFO) << "\n" << pattern->DotString();
} }
int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, static int BuildFusion(Graph* graph, const std::string& name_scope,
bool with_fc_bias) { Scope* scope, bool with_fc_bias) {
GraphPatternDetector gpd; GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern(); auto* pattern = gpd.mutable_pattern();
...@@ -87,15 +88,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -87,15 +88,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
} }
op_desc.SetInput("Bias", {new_bias_var}); op_desc.SetInput("Bias", {new_bias_var});
} }
#undef GET_NODE #undef GET_NODE
// Create temp variables.
scope->Var(name_scope + "/BatchedInput.new")
->GetMutable<framework::LoDTensor>();
scope->Var(name_scope + "/BatchCellPreAct.new")
->GetMutable<framework::LoDTensor>();
scope->Var(name_scope + "/BatchedGate.new")
->GetMutable<framework::LoDTensor>();
op_desc.SetInput("H0", {}); op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {}); op_desc.SetInput("C0", {});
op_desc.SetOutput("Hidden", {hidden_n->Name()}); op_desc.SetOutput("Hidden", {hidden_n->Name()});
op_desc.SetOutput("Cell", {cell_n->Name()}); op_desc.SetOutput("Cell", {cell_n->Name()});
op_desc.SetOutput("XX", {xx_n->Name()}); op_desc.SetOutput("XX", {xx_n->Name()});
op_desc.SetOutput("BatchedInput", {"blstm_0.tmp_2"}); op_desc.SetOutput("BatchedGate", {name_scope + "/BatchedGate.new"});
op_desc.SetOutput("BatchCellPreAct", {name_scope + "/BatchCellPreAct.new"});
op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"});
op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse")); op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes")); op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
// TODO(TJ): get from attr // TODO(TJ): get from attr
...@@ -131,8 +141,8 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -131,8 +141,8 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
int fusion_count{0}; int fusion_count{0};
auto fc_no_bias_handler = [&]( auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { Graph* g) {
#define GET_NODE(name__) \ #define GET_NODE(name__) \
std::string name__##key = name_scope + "/" + #name__; \ std::string name__##key = name_scope + "/" + #name__; \
auto* name__##n = pattern->RetrieveNode(name__##key); \ auto* name__##n = pattern->RetrieveNode(name__##key); \
...@@ -153,21 +163,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -153,21 +163,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
if (with_fc_bias) { if (with_fc_bias) {
GET_NODE(fc_bias); GET_NODE(fc_bias);
GET_NODE(elementwise_add);
lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias); lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul_n, lstm_n, elementwise_add_n});
GraphSafeRemoveNodes(graph, marked_nodes);
} else { } else {
lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1); lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
GraphSafeRemoveNodes(graph, marked_nodes);
} }
#undef GET_NODE #undef GET_NODE
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
GraphSafeRemoveNodes(graph, marked_nodes);
++fusion_count; ++fusion_count;
}; };
gpd(graph, fc_no_bias_handler); gpd(graph, handler);
return fusion_count; return fusion_count;
} }
......
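// Side note on the "Create temp variables" hunk above (an illustrative
// sketch, not part of the commit): the fused op writes into fresh scope
// variables, which must exist before the kernel first runs. Names below are
// hypothetical.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"

static void PrepareFusionTemps(paddle::framework::Scope* scope) {
  // Materializing each variable as a LoDTensor is enough; the fused kernel
  // resizes them on its first run.
  scope->Var("fc_lstm_fuse/BatchedInput.new")
      ->GetMutable<paddle::framework::LoDTensor>();
  scope->Var("fc_lstm_fuse/BatchedGate.new")
      ->GetMutable<paddle::framework::LoDTensor>();
  scope->Var("fc_lstm_fuse/BatchCellPreAct.new")
      ->GetMutable<paddle::framework::LoDTensor>();
}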
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
......
...@@ -73,7 +73,6 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) { ...@@ -73,7 +73,6 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {
void GraphPatternDetector::operator()(Graph* graph, void GraphPatternDetector::operator()(Graph* graph,
GraphPatternDetector::handle_t handler) { GraphPatternDetector::handle_t handler) {
if (!MarkPDNodesInGraph(*graph)) { if (!MarkPDNodesInGraph(*graph)) {
LOG(INFO) << "Mark failed";
return; return;
} }
...@@ -86,7 +85,7 @@ void GraphPatternDetector::operator()(Graph* graph, ...@@ -86,7 +85,7 @@ void GraphPatternDetector::operator()(Graph* graph,
LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern"; LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
int id = 0; int id = 0;
for (auto& g : subgraphs) { for (auto& g : subgraphs) {
LOG(INFO) << "optimizing #" << id++ << " subgraph"; VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph); handler(g, graph);
} }
} }
...@@ -520,76 +519,96 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) { ...@@ -520,76 +519,96 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) {
PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope, PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
PDNode* x, bool with_bias) { PDNode* x, bool with_bias) {
// Create Operators // mul op
PDNode* elementwise_add_op{nullptr};
auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul"); auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul");
if (with_bias) {
elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
->assert_is_op("elementwise_add");
}
// Create variables
// w
auto* mul_weight_var = pattern->NewNode(name_scope, "w") auto* mul_weight_var = pattern->NewNode(name_scope, "w")
->AsInput() ->AsInput()
->assert_is_persistable_var() ->assert_is_persistable_var()
->assert_is_op_nth_input("mul", "Y", 0); ->assert_is_op_input("mul", "Y");
PDNode* mul_out_var{nullptr};
PDNode* fc_out{nullptr};
if (with_bias) { if (with_bias) {
PDNode* elementwise_add_op{nullptr};
PDNode *mul_out_var{nullptr}, *bias{nullptr};
elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
->assert_is_op("elementwise_add");
// intermediate variable, will be removed in the IR after fuse. // intermediate variable, will be removed in the IR after fuse.
mul_out_var = pattern->NewNode(name_scope, "mul_out") mul_out_var = pattern->NewNode(name_scope, "mul_out")
->AsIntermediate() ->AsIntermediate()
->assert_is_only_output_of_op("mul") ->assert_is_only_output_of_op("mul")
->assert_is_op_input("elementwise_add"); ->assert_is_op_input("elementwise_add");
}
PDNode *bias{nullptr}, *fc_out{nullptr};
if (with_bias) {
// bias // bias
bias = pattern->NewNode(name_scope, "fc_bias") bias = pattern->NewNode(name_scope, "fc_bias")
->assert_is_op_input("elementwise_add") ->AsInput()
->AsInput(); ->assert_is_op_input("elementwise_add");
// output // output
fc_out = pattern->NewNode(name_scope, "fc_out") fc_out = pattern->NewNode(name_scope, "fc_out")
->AsOutput() ->AsOutput()
->assert_is_op_output("elementwise_add"); ->assert_is_op_output("elementwise_add");
mul_op->LinksFrom({x, mul_weight_var}).LinksTo({mul_out_var});
elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
} else { } else {
fc_out = pattern->NewNode(name_scope, "fc_out") fc_out = pattern->NewNode(name_scope, "fc_out")
->AsOutput() ->AsOutput()
->assert_is_op_output("mul"); ->assert_is_op_output("mul");
}
if (with_bias) {
mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var});
elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
} else {
mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out}); mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out});
} }
return fc_out; return fc_out;
} }
#define NEW_NODE(op__, arg__, io__) \
auto* arg__ = pattern->NewNode(name_scope, #arg__) \
->assert_is_op_##io__(#op__, #arg__);
PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
PDNode* x) { PDNode* x) {
x->assert_is_op_input("lstm", "Input"); x->assert_is_op_input("lstm", "Input");
auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm"); auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm");
#define NEW_NODE(arg__, io__) \
auto* arg__ = pattern->NewNode(name_scope, #arg__) \
->assert_is_op_##io__("lstm", #arg__);
// Currently, the H0 and C0 are optional // Currently, the H0 and C0 are optional
// TODO(Superjomn) upgrade the fuse framework to support optional. // TODO(Superjomn) upgrade the fuse framework to support optional.
// NEW_NODE(H0, input); // NEW_NODE(H0, input);
// NEW_NODE(C0, input); // NEW_NODE(C0, input);
NEW_NODE(Weight, input); NEW_NODE(lstm, Weight, input);
NEW_NODE(Bias, input); NEW_NODE(lstm, Bias, input);
NEW_NODE(Hidden, output); NEW_NODE(lstm, Hidden, output);
NEW_NODE(Cell, output); NEW_NODE(lstm, Cell, output);
NEW_NODE(BatchGate, output); NEW_NODE(lstm, BatchGate, output);
NEW_NODE(BatchCellPreAct, output); NEW_NODE(lstm, BatchCellPreAct, output);
lstm_op->LinksFrom({x, Weight, Bias}); lstm_op->LinksFrom({x, Weight, Bias});
lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct}); lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
return Hidden; return Hidden;
} }
PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope,
PDNode* x) {
x->assert_is_op_input("gru", "Input");
auto* gru_op = pattern->NewNode(name_scope, "gru")->assert_is_op("gru");
NEW_NODE(gru, Weight, input);
// TODO(Superjomn): upgrade the fuse framework to support optional.
// H0 and bias are optional
NEW_NODE(gru, Bias, input); // also optional
// NEW_NODE(H0, input);
NEW_NODE(gru, Hidden, output);
// below are intermediate
NEW_NODE(gru, BatchGate, output);
NEW_NODE(gru, BatchResetHiddenPrev, output);
NEW_NODE(gru, BatchHidden, output);
BatchGate->AsIntermediate();
BatchResetHiddenPrev->AsIntermediate();
BatchHidden->AsIntermediate();
gru_op->LinksFrom({x, Weight, Bias});
gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden});
return Hidden;
}
#undef NEW_NODE
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
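// Usage sketch (illustrative, scope name hypothetical): the pattern helpers
// compose the same way the fuse passes' BuildPattern functions do. Assuming a
// caller-provided Graph*, an FC + GRU chain can be described and matched so:
static void MatchFCGRU(paddle::framework::ir::Graph* g) {
  using paddle::framework::ir::GraphPatternDetector;
  using paddle::framework::ir::PDNode;
  GraphPatternDetector gpd;
  auto* pattern = gpd.mutable_pattern();
  PDNode* x = pattern->NewNode("fc_gru", "x")
                  ->assert_is_op_input("mul")
                  ->assert_var_not_persistable();
  PDNode* fc_out =
      paddle::framework::ir::patterns::FC(pattern, "fc_gru", x, true);
  fc_out->AsIntermediate()->assert_is_op_input("gru", "Input");
  paddle::framework::ir::patterns::GRU(pattern, "fc_gru", fc_out);
  gpd(g, [&](const GraphPatternDetector::subgraph_t& subgraph,
             paddle::framework::ir::Graph* graph) {
    // Matched nodes are fetched by key, e.g.
    // pattern->RetrieveNode("fc_gru/Weight").
  });
}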
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
#endif #endif
#include <numeric> #include <numeric>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/dot.h"
...@@ -295,6 +298,8 @@ PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x, ...@@ -295,6 +298,8 @@ PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x,
PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x); PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);
PDNode* GRU(PDPattern* pattern, const std::string& name_scope, PDNode* x);
} // namespace patterns } // namespace patterns
#define IR_NODE_LINK_TO(a, b) \ #define IR_NODE_LINK_TO(a, b) \
......
...@@ -50,20 +50,37 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl( ...@@ -50,20 +50,37 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
Dot dot; Dot dot;
std::vector<Dot::Attr> op_attrs({Dot::Attr("style", "filled"), const std::vector<Dot::Attr> op_attrs({
Dot::Attr("shape", "box"), Dot::Attr("style", "rounded,filled,bold"), //
Dot::Attr("fillcolor", "red")}); Dot::Attr("shape", "box"), //
std::vector<Dot::Attr> var_attrs({Dot::Attr("style", "filled,rounded"), Dot::Attr("color", "#303A3A"), //
// Dot::Attr("shape", "diamond"), Dot::Attr("fontcolor", "#ffffff"), //
Dot::Attr("fillcolor", "yellow")}); Dot::Attr("width", "1.3"), //
Dot::Attr("height", "0.84"), //
std::vector<Dot::Attr> marked_op_attrs({Dot::Attr("style", "filled"), Dot::Attr("fontname", "Arial"), //
Dot::Attr("shape", "box"), });
Dot::Attr("fillcolor", "lightgray")}); const std::vector<Dot::Attr> arg_attrs({
std::vector<Dot::Attr> marked_var_attrs( Dot::Attr("shape", "box"), //
{Dot::Attr("style", "filled,rounded"), Dot::Attr("style", "rounded,filled,bold"), //
// Dot::Attr("shape", "diamond"), Dot::Attr("fontname", "Arial"), //
Dot::Attr("fillcolor", "lightgray")}); Dot::Attr("fillcolor", "#999999"), //
Dot::Attr("color", "#dddddd"), //
});
const std::vector<Dot::Attr> param_attrs({
Dot::Attr("shape", "box"), //
Dot::Attr("style", "rounded,filled,bold"), //
Dot::Attr("fontname", "Arial"), //
Dot::Attr("color", "#148b97"), //
Dot::Attr("fontcolor", "#ffffff"), //
});
const std::vector<Dot::Attr> marked_op_attrs(
{Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"),
Dot::Attr("fillcolor", "yellow")});
const std::vector<Dot::Attr> marked_var_attrs(
{Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"),
Dot::Attr("fillcolor", "yellow")});
auto marked_nodes = ConsumeMarkedNodes(graph.get()); auto marked_nodes = ConsumeMarkedNodes(graph.get());
// Create nodes // Create nodes
...@@ -74,9 +91,17 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl( ...@@ -74,9 +91,17 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
marked_nodes.count(n) ? marked_op_attrs : op_attrs; marked_nodes.count(n) ? marked_op_attrs : op_attrs;
dot.AddNode(node_id, attr, node_id); dot.AddNode(node_id, attr, node_id);
} else if (n->IsVar()) { } else if (n->IsVar()) {
decltype(op_attrs) attr = decltype(op_attrs)* attr;
marked_nodes.count(n) ? marked_var_attrs : var_attrs; if (marked_nodes.count(n)) {
dot.AddNode(node_id, attr, node_id); attr = &marked_var_attrs;
} else if (const_cast<Node*>(n)->Var() &&
const_cast<Node*>(n)->Var()->Persistable()) {
attr = &param_attrs;
} else {
attr = &arg_attrs;
}
dot.AddNode(node_id, *attr, node_id);
} }
node2dot[n] = node_id; node2dot[n] = node_id;
} }
......
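// For reference, a minimal sketch of the Dot helper used above, which emits
// plain Graphviz text (node ids hypothetical; attr vectors as defined above;
// Build() assumed per the inference::analysis::Dot API):
Dot dot;
dot.AddNode("fc_0", op_attrs, "fc_0");
dot.AddNode("fc_0.out", arg_attrs, "fc_0.out");
dot.AddEdge("fc_0", "fc_0.out", {});
std::string dot_text = dot.Build();
// Render with: dot -Tpng -o graph.png graph.dot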
...@@ -6,6 +6,7 @@ cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits ...@@ -6,6 +6,7 @@ cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits
analyzer.cc analyzer.cc
helper.cc helper.cc
# passes # passes
analysis_pass.cc
fluid_to_data_flow_graph_pass.cc fluid_to_data_flow_graph_pass.cc
data_flow_graph_to_fluid_pass.cc data_flow_graph_to_fluid_pass.cc
dfg_graphviz_draw_pass.cc dfg_graphviz_draw_pass.cc
...@@ -58,7 +59,7 @@ endif() ...@@ -58,7 +59,7 @@ endif()
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
--infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt) --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
...@@ -74,25 +75,42 @@ inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) ...@@ -74,25 +75,42 @@ inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE) set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING) if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz") inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz") inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
endif() endif()
inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
--infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE) set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING) if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz") inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz") inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
endif() endif()
inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
ARGS --infer_model=${LAC_INSTALL_DIR}/model ARGS --infer_model=${LAC_INSTALL_DIR}/model
--infer_data=${LAC_INSTALL_DIR}/data.txt) --infer_data=${LAC_INSTALL_DIR}/data.txt)
set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE)
if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz")
endif()
inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
--infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
--topn=1 # Just run top 1 batch.
)
...@@ -12,4 +12,4 @@ ...@@ -12,4 +12,4 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/analysis_pass.h"
...@@ -28,10 +28,10 @@ namespace paddle { ...@@ -28,10 +28,10 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
class Pass { class AnalysisPass {
public: public:
Pass() = default; AnalysisPass() = default;
virtual ~Pass() = default; virtual ~AnalysisPass() = default;
// Mutable Pass. // Mutable Pass.
virtual bool Initialize(Argument *argument) { return false; } virtual bool Initialize(Argument *argument) { return false; }
// Readonly Pass. // Readonly Pass.
...@@ -42,23 +42,16 @@ class Pass { ...@@ -42,23 +42,16 @@ class Pass {
virtual bool Finalize() { return false; } virtual bool Finalize() { return false; }
// Get a Pass appropriate to print the Node this pass operates on. // Get a Pass appropriate to print the Node this pass operates on.
virtual Pass *CreatePrinterPass(std::ostream &os, virtual AnalysisPass *CreatePrinterPass(std::ostream &os,
const std::string &banner) const { const std::string &banner) const {
return nullptr; return nullptr;
} }
// Create a debugger Pass that draws the DFG by graphviz toolkit. // Create a debugger Pass that draws the DFG by graphviz toolkit.
virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; } virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }
virtual void Run() { LOG(FATAL) << "not valid"; }
// Run on a single Node.
virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
// Run on a single Function.
virtual void Run(Function *x) { LOG(FATAL) << "not valid"; }
// Run on a single FunctionBlock.
virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
// Run on a single DataFlowGraph. // Run on a single DataFlowGraph.
virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; } virtual void Run(DataFlowGraph *x) = 0;
// Human-readable short representation. // Human-readable short representation.
virtual std::string repr() const = 0; virtual std::string repr() const = 0;
...@@ -66,29 +59,8 @@ class Pass { ...@@ -66,29 +59,8 @@ class Pass {
virtual std::string description() const { return "No DOC"; } virtual std::string description() const { return "No DOC"; }
}; };
// NodePass process on any Node types.
class NodePass : public Pass {
public:
virtual void Run(Node *node) = 0;
};
// NodePass process on any Function node types.
class FunctionPass : public Pass {
public:
virtual void Run(Function *node) = 0;
};
// NodePass process on any FunctionBlock node types.
class FunctionBlockPass : public Pass {
public:
virtual void Run(FunctionBlock *node) = 0;
};
// GraphPass processes on any GraphType. // GraphPass processes on any GraphType.
class DataFlowGraphPass : public Pass { class DataFlowGraphPass : public AnalysisPass {};
public:
virtual void Run(DataFlowGraph *graph) = 0;
};
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
......
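// A minimal pass written against the slimmed-down AnalysisPass hierarchy (an
// illustrative sketch; the counting body mirrors the node iteration used by
// the testers later in this commit):
class FunctionCounterPass final : public DataFlowGraphPass {
 public:
  bool Initialize(Argument *argument) override { return true; }
  void Run(DataFlowGraph *graph) override {
    int num_functions = 0;
    for (auto &node : graph->nodes.nodes()) {
      if (node->IsFunction()) ++num_functions;
    }
    LOG(INFO) << "graph holds " << num_functions << " function nodes";
  }
  bool Finalize() override { return true; }
  std::string repr() const override { return "function-counter"; }
  std::string description() const override {
    return "Counts function nodes in a DataFlowGraph.";
  }
};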
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
...@@ -41,27 +43,23 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -41,27 +43,23 @@ class DfgPassManagerImpl final : public DfgPassManager {
public: public:
DfgPassManagerImpl() { DfgPassManagerImpl() {
// TODO(Superjomn) set the key with pass reprs. // TODO(Superjomn) set the key with pass reprs.
LOG(INFO) if (!FLAGS_IA_enable_ir) {
<< "-----------------------------------------------------------------";
if (FLAGS_IA_enable_ir) {
AddPass("fluid-to-ir-pass", new FluidToIrPass);
} else {
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
} else {
AddPass("fluid-to-ir-pass", new FluidToIrPass);
} }
TryAddTensorRtPass(); TryAddTensorRtPass();
AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
if (!FLAGS_IA_output_storage_path.empty()) { if (!FLAGS_IA_output_storage_path.empty()) {
AddPass("model-store-pass", new ModelStorePass); AddPass("model-store-pass", new ModelStorePass);
} }
LOG(INFO)
<< "-----------------------------------------------------------------";
} }
std::string repr() const override { return "dfg-pass-manager"; } std::string repr() const override { return "dfg-pass-manager"; }
std::string description() const override { return "DFG pass manager."; } std::string description() const override { return "DFG pass manager."; }
private: private:
void AddPass(const std::string& name, Pass* pass) { void AddPass(const std::string& name, AnalysisPass* pass) {
VLOG(3) << "Adding pass " << name; VLOG(3) << "Adding pass " << name;
Register(name, pass); Register(name, pass);
AddGraphvizDebugerPass(pass); AddGraphvizDebugerPass(pass);
...@@ -90,7 +88,7 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -90,7 +88,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
} }
// Add the graphviz debugger pass if the parent pass has one. // Add the graphviz debugger pass if the parent pass has one.
void AddGraphvizDebugerPass(Pass* pass) { void AddGraphvizDebugerPass(AnalysisPass* pass) {
auto* debuger_pass = pass->CreateGraphvizDebugerPass(); auto* debuger_pass = pass->CreateGraphvizDebugerPass();
if (debuger_pass) { if (debuger_pass) {
Register(debuger_pass->repr(), debuger_pass); Register(debuger_pass->repr(), debuger_pass);
...@@ -101,19 +99,15 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -101,19 +99,15 @@ class DfgPassManagerImpl final : public DfgPassManager {
Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
void Analyzer::Run(Argument* argument) { void Analyzer::Run(Argument* argument) {
// Ugly support fluid-to-ir-pass std::vector<std::string> passes;
argument->Set(kFluidToIrPassesAttr, for (auto& pass : all_ir_passes_) {
new std::vector<std::string>({ if (!disabled_ir_passes_.count(pass)) {
// Manual update the passes here. passes.push_back(pass);
"graph_viz_pass", // passes.push_back("graph_viz_pass"); // add graphviz for debug.
"infer_clean_graph_pass", "graph_viz_pass", // }
"attention_lstm_fuse_pass", "graph_viz_pass", // }
"fc_lstm_fuse_pass", "graph_viz_pass", // passes.push_back("graph_viz_pass");
"mul_lstm_fuse_pass", "graph_viz_pass", // argument->Set(kFluidToIrPassesAttr, new std::vector<std::string>(passes));
"seq_concat_fc_fuse_pass", "graph_viz_pass", //
"fc_fuse_pass", "graph_viz_pass" //
}));
for (auto& x : data_) { for (auto& x : data_) {
PADDLE_ENFORCE(x->Initialize(argument)); PADDLE_ENFORCE(x->Initialize(argument));
...@@ -122,6 +116,11 @@ void Analyzer::Run(Argument* argument) { ...@@ -122,6 +116,11 @@ void Analyzer::Run(Argument* argument) {
} }
} }
Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
disabled_ir_passes_.insert(passes.begin(), passes.end());
return *this;
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
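// Usage sketch for the new DisableIrPasses API (the Argument setup is
// elided; `argument` stands for one prepared by the caller):
Analyzer analyzer;
analyzer.DisableIrPasses({"mul_gru_fuse_pass", "mul_lstm_fuse_pass"});
analyzer.Run(&argument);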
...@@ -36,16 +36,12 @@ limitations under the License. */ ...@@ -36,16 +36,12 @@ limitations under the License. */
*/ */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include "paddle/fluid/inference/analysis/pass.h" #include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/pass_manager.h"
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
DECLARE_string(IA_graphviz_log_root);
DECLARE_string(IA_output_storage_path);
DECLARE_bool(IA_enable_ir);
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
...@@ -57,7 +53,28 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -57,7 +53,28 @@ class Analyzer : public OrderedRegistry<PassManager> {
void Run(Argument* argument); void Run(Argument* argument);
Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
DISABLE_COPY_AND_ASSIGN(Analyzer); DISABLE_COPY_AND_ASSIGN(Analyzer);
private:
  // All available IR passes.
  // The larger fusions come first so that small operators are preferentially
  // merged into larger fused ops; applying the smaller fusions afterwards
  // cannot break the patterns of the larger ones.
const std::vector<std::string> all_ir_passes_{{
// Manual update the passes here.
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
}};
std::unordered_set<std::string> disabled_ir_passes_;
}; };
} // namespace analysis } // namespace analysis
......
...@@ -11,13 +11,14 @@ ...@@ -11,13 +11,14 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path for LAC"); DEFINE_string(infer_model, "", "model path for LAC");
...@@ -102,6 +103,7 @@ struct DataRecord { ...@@ -102,6 +103,7 @@ struct DataRecord {
return data; return data;
} }
}; };
void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data, void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) { int batch_size) {
auto one_batch = data->NextBatch(); auto one_batch = data->NextBatch();
...@@ -114,12 +116,7 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -114,12 +116,7 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1)); PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
input_slots->assign({input_tensor}); input_slots->assign({input_tensor});
} }
static void PrintTime(const double latency, const int bs, const int repeat) {
LOG(INFO) << "===========profile result===========";
LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
<< ", avg latency: " << latency / repeat << "ms";
LOG(INFO) << "=====================================";
}
void BenchAllData(const std::string &model_path, const std::string &data_file, void BenchAllData(const std::string &model_path, const std::string &data_file,
const int batch_size, const int repeat) { const int batch_size, const int repeat) {
NativeConfig config; NativeConfig config;
...@@ -145,19 +142,18 @@ void BenchAllData(const std::string &model_path, const std::string &data_file, ...@@ -145,19 +142,18 @@ void BenchAllData(const std::string &model_path, const std::string &data_file,
sum += timer.toc(); sum += timer.toc();
} }
} }
PrintTime(sum, batch_size, repeat); PrintTime(batch_size, repeat, 1, 0, sum / repeat);
} }
const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23}; 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
void TestLACPrediction(const std::string &model_path, void TestLACPrediction(const std::string &model_path,
const std::string &data_file, const int batch_size, const std::string &data_file, const int batch_size,
const int repeat, bool test_all_data) { const int repeat, bool test_all_data,
if (test_all_data) { bool use_analysis = false) {
BenchAllData(model_path, data_file, batch_size, repeat);
return;
}
NativeConfig config; NativeConfig config;
config.model_dir = model_path; config.model_dir = model_path;
config.use_gpu = false; config.use_gpu = false;
...@@ -166,17 +162,47 @@ void TestLACPrediction(const std::string &model_path, ...@@ -166,17 +162,47 @@ void TestLACPrediction(const std::string &model_path,
std::vector<PaddleTensor> input_slots, outputs_slots; std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size); DataRecord data(data_file, batch_size);
GetOneBatch(&input_slots, &data, batch_size); GetOneBatch(&input_slots, &data, batch_size);
auto predictor = std::unique_ptr<PaddlePredictor> predictor;
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); if (use_analysis) {
AnalysisConfig cfg;
cfg.model_dir = model_path;
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else {
predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
}
for (int i = 0; i < FLAGS_burning; i++) { for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots); predictor->Run(input_slots, &outputs_slots);
} }
Timer timer; Timer timer;
if (test_all_data) {
double sum = 0;
LOG(INFO) << "Total number of samples: " << data.datasets.size();
for (int i = 0; i < repeat; i++) {
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
GetOneBatch(&input_slots, &data, batch_size);
timer.tic();
predictor->Run(input_slots, &outputs_slots);
sum += timer.toc();
}
}
PrintTime(batch_size, repeat, 1, 0, sum / repeat);
LOG(INFO) << "Average latency of each sample: "
<< sum / repeat / data.datasets.size() << " ms";
return;
}
timer.tic(); timer.tic();
for (int i = 0; i < repeat; i++) { for (int i = 0; i < repeat; i++) {
predictor->Run(input_slots, &outputs_slots); predictor->Run(input_slots, &outputs_slots);
} }
PrintTime(timer.toc(), batch_size, repeat); PrintTime(batch_size, repeat, 1, 0, timer.toc() / repeat);
// check result
EXPECT_EQ(outputs_slots.size(), 1UL); EXPECT_EQ(outputs_slots.size(), 1UL);
auto &out = outputs_slots[0]; auto &out = outputs_slots[0];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
...@@ -188,12 +214,60 @@ void TestLACPrediction(const std::string &model_path, ...@@ -188,12 +214,60 @@ void TestLACPrediction(const std::string &model_path,
for (size_t i = 0; i < batch1_size; ++i) { for (size_t i = 0; i < batch1_size; ++i) {
EXPECT_EQ(pdata[i], lac_ref_data[i]); EXPECT_EQ(pdata[i], lac_ref_data[i]);
} }
if (use_analysis) {
    // Run the native predictor once as the reference for comparison.
auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], pdata[i]);
}
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
int num_ops = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
}
LOG(INFO) << "has num ops: " << num_ops;
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
EXPECT_EQ(num_ops, 11);
}
} }
TEST(Analyzer_LAC, native) { TEST(Analyzer_LAC, native) {
LOG(INFO) << "LAC with native"; LOG(INFO) << "LAC with native";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data); FLAGS_repeat, FLAGS_test_all_data);
} }
TEST(Analyzer_LAC, analysis) {
LOG(INFO) << "LAC with analysis";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data, true);
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
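// The ad-hoc profile logging above was replaced by the shared PrintTime
// helper from api/helper.h; as used throughout these tests its argument
// order is (batch_size, repeat, num_threads, thread_id, latency_ms), e.g.:
PrintTime(/*batch_size=*/1, /*repeat=*/100, /*num_threads=*/1, /*tid=*/0,
          /*latency=*/12.5);  // values illustrative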
...@@ -13,18 +13,19 @@ ...@@ -13,18 +13,19 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path"); DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 10, "batch size."); DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -35,6 +36,7 @@ struct DataRecord { ...@@ -35,6 +36,7 @@ struct DataRecord {
std::vector<size_t> lod; // two inputs have the same lod info. std::vector<size_t> lod; // two inputs have the same lod info.
size_t batch_iter{0}; size_t batch_iter{0};
size_t batch_size{1}; size_t batch_size{1};
size_t num_samples; // total number of samples
DataRecord() = default; DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1) explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) { : batch_size(batch_size) {
...@@ -81,6 +83,7 @@ struct DataRecord { ...@@ -81,6 +83,7 @@ struct DataRecord {
word_data_all.push_back(std::move(word_data)); word_data_all.push_back(std::move(word_data));
mention_data_all.push_back(std::move(mention_data)); mention_data_all.push_back(std::move(mention_data));
} }
num_samples = num_lines;
} }
}; };
...@@ -109,7 +112,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -109,7 +112,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
48, 39, 38, 16, 25}; 48, 39, 38, 16, 25};
void TestChineseNERPrediction() { void TestChineseNERPrediction(bool use_analysis) {
NativeConfig config; NativeConfig config;
config.prog_file = FLAGS_infer_model + "/__model__"; config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param"; config.param_file = FLAGS_infer_model + "/param";
...@@ -117,24 +120,53 @@ void TestChineseNERPrediction() { ...@@ -117,24 +120,53 @@ void TestChineseNERPrediction() {
config.device = 0; config.device = 0;
config.specify_input_name = true; config.specify_input_name = true;
auto predictor = std::vector<PaddleTensor> input_slots, outputs;
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); std::unique_ptr<PaddlePredictor> predictor;
std::vector<PaddleTensor> input_slots; Timer timer;
DataRecord data(FLAGS_infer_data, FLAGS_batch_size); if (use_analysis) {
AnalysisConfig cfg;
cfg.prog_file = FLAGS_infer_model + "/__model__";
cfg.param_file = FLAGS_infer_model + "/param";
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else {
predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
}
if (FLAGS_test_all_data) {
LOG(INFO) << "test all data";
double sum = 0;
size_t num_samples;
for (int i = 0; i < FLAGS_repeat; i++) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
num_samples = data.num_samples;
for (size_t bid = 0; bid < num_samples; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
timer.tic();
predictor->Run(input_slots, &outputs);
sum += timer.toc();
}
}
LOG(INFO) << "total number of samples: " << num_samples;
PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
LOG(INFO) << "average latency of each sample: "
<< sum / FLAGS_repeat / num_samples;
return;
}
// Prepare inputs. // Prepare inputs.
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
PrepareInputs(&input_slots, &data, FLAGS_batch_size); PrepareInputs(&input_slots, &data, FLAGS_batch_size);
std::vector<PaddleTensor> outputs;
Timer timer;
timer.tic(); timer.tic();
for (int i = 0; i < FLAGS_repeat; i++) { for (int i = 0; i < FLAGS_repeat; i++) {
predictor->Run(input_slots, &outputs); predictor->Run(input_slots, &outputs);
} }
LOG(INFO) << "===========profile result==========="; PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, timer.toc() / FLAGS_repeat);
LOG(INFO) << "batch_size: " << FLAGS_batch_size
<< ", repeat: " << FLAGS_repeat
<< ", latency: " << timer.toc() / FLAGS_repeat << "ms";
LOG(INFO) << "=====================================";
PADDLE_ENFORCE(outputs.size(), 1UL); PADDLE_ENFORCE(outputs.size(), 1UL);
auto &out = outputs[0]; auto &out = outputs[0];
...@@ -145,10 +177,51 @@ void TestChineseNERPrediction() { ...@@ -145,10 +177,51 @@ void TestChineseNERPrediction() {
for (size_t i = 0; i < std::min(11UL, size); i++) { for (size_t i = 0; i < std::min(11UL, size); i++) {
PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]); PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]);
} }
if (use_analysis) {
    // Run the native predictor once as the reference for comparison.
auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs.size());
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], result[i]);
}
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
int num_ops = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
}
LOG(INFO) << "has num ops: " << num_ops;
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
EXPECT_EQ(num_ops, 14);
}
} }
// Directly infer with the original model. TEST(Analyzer_Chinese_ner, native) { TestChineseNERPrediction(false); }
TEST(Analyzer, Chinese_ner) { TestChineseNERPrediction(); }
TEST(Analyzer_Chinese_ner, analysis) { TestChineseNERPrediction(true); }
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
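// The fuse-statistics check shared by the LAC and NER testers, in brief:
// each fuse pass records its replacement count via AddStatis, and the map is
// read back from the analysis argument under framework::ir::kFuseStatisAttr:
auto *analysis_predictor =
    dynamic_cast<AnalysisPredictor *>(predictor.get());
const auto &fuse_statis =
    analysis_predictor->analysis_argument()
        .Get<std::unordered_map<std::string, int>>(
            framework::ir::kFuseStatisAttr);
for (const auto &item : fuse_statis) {
  LOG(INFO) << "fused " << item.first << " x" << item.second;
}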
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <google/protobuf/text_format.h> #include <google/protobuf/text_format.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
...@@ -24,12 +25,12 @@ ...@@ -24,12 +25,12 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN"); DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN"); DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
DEFINE_int32(batch_size, 10, "batch size."); DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -220,39 +221,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -220,39 +221,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
switch (tensor.dtype) {
case PaddleDType::FLOAT32:
os << "float32";
break;
case PaddleDType::INT64:
os << "int64";
break;
default:
os << "unset";
}
os << '\n';
os << " - shape: " << to_string(tensor.shape) << '\n';
os << " - lod: ";
for (auto &l : tensor.lod) {
os << to_string(l) << "; ";
}
os << "\n";
os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
os << '\n';
return os.str();
}
} // namespace } // namespace
const float ditu_rnn_target_data[] = { const float ditu_rnn_target_data[] = {
...@@ -266,55 +234,93 @@ const float ditu_rnn_target_data[] = { ...@@ -266,55 +234,93 @@ const float ditu_rnn_target_data[] = {
10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0, 10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0,
93.5771, 3.84641, 0, 0, 0, 0, 0, 0, 93.5771, 3.84641, 0, 0, 0, 0, 0, 0,
169.426, 0, 0, 0, 0, 0, 0, 0}; 169.426, 0, 0, 0, 0, 0, 0, 0};
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &base_outputs) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
}
}
}
// Test with a really complicate model. // Test with a really complicate model.
void TestDituRNNPrediction(const std::string &model_path, void TestDituRNNPrediction(bool use_analysis, bool activate_ir,
const std::string &data_path, int batch_size, int num_threads) {
bool use_analysis, bool activate_ir, AnalysisConfig config;
int num_times = 1) {
NativeConfig config;
config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__"; config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
config.param_file = FLAGS_infer_ditu_rnn_model + "/param"; config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
config.use_gpu = false; config.use_gpu = false;
config.device = 0; config.device = 0;
config.specify_input_name = true; config.specify_input_name = true;
config.enable_ir_optim = activate_ir;
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
config.ir_passes.clear(); // Do not exclude any pass.
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto base_predictor = auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
auto predictor = auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config); CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots; std::vector<PaddleTensor> input_slots;
DataRecord data(data_path, batch_size); DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
// Prepare inputs. // Prepare inputs.
PrepareInputs(&input_slots, &data, batch_size); PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs; std::vector<PaddleTensor> outputs, base_outputs;
base_predictor->Run(input_slots, &base_outputs); base_predictor->Run(input_slots, &base_outputs);
Timer timer; if (num_threads == 1) {
timer.tic(); // Prepare inputs.
for (int i = 0; i < num_times; i++) { Timer timer;
predictor->Run(input_slots, &outputs); timer.tic();
} for (int i = 0; i < num_times; i++) {
LOG(INFO) << "===========profile result==========="; predictor->Run(input_slots, &outputs);
LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times }
<< ", latency: " << timer.toc() / num_times << "ms"; PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
LOG(INFO) << "====================================="; CompareResult(outputs, base_outputs);
} else {
PADDLE_ENFORCE_GT(outputs.size(), 0); std::vector<std::thread> threads;
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); std::vector<std::unique_ptr<PaddlePredictor>> predictors;
for (size_t i = 0; i < outputs.size(); i++) { // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
auto &out = outputs[i]; // because AttentionLSTM's hard code nodeid will be damanged.
auto &base_out = base_outputs[i]; for (int tid = 0; tid < num_threads; ++tid) {
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, predictors.emplace_back(
[](int a, int b) { return a * b; }); CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), config));
1, [](int a, int b) { return a * b; }); }
PADDLE_ENFORCE_EQ(size, size1); for (int tid = 0; tid < num_threads; ++tid) {
PADDLE_ENFORCE_GT(size, 0); threads.emplace_back([&, tid]() {
float *data = static_cast<float *>(out.data.data()); // Each thread should have local input_slots and outputs.
float *base_data = static_cast<float *>(base_out.data.data()); std::vector<PaddleTensor> input_slots;
for (size_t j = 0; j < size; j++) { DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
EXPECT_NEAR(data[j], base_data[j], 1e-3); PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs;
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictors[tid]->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, num_threads, tid,
timer.toc() / num_times);
CompareResult(outputs, base_outputs);
});
}
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
} }
} }
...@@ -345,25 +351,26 @@ void TestDituRNNPrediction(const std::string &model_path, ...@@ -345,25 +351,26 @@ void TestDituRNNPrediction(const std::string &model_path,
} }
} }
// Directly infer with the original model. // Inference with analysis and IR, easy for profiling independently.
TEST(Analyzer, DituRNN_without_analysis) { TEST(Analyzer, DituRNN) {
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, TestDituRNNPrediction(true, true, FLAGS_num_threads);
FLAGS_batch_size, false, false, FLAGS_repeat);
} }
// Inference with the original model with the analysis turned on, the analysis // Other unit tests of DituRNN, exercising different combinations of
// module will transform the program to a data flow graph. // use_analysis, activate_ir, and thread counts.
TEST(Analyzer, DituRNN_with_analysis) { TEST(Analyzer, DituRNN_tests) {
LOG(INFO) << "ditu rnn with analysis"; int num_threads[2] = {1, 4};
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, for (auto i : num_threads) {
FLAGS_batch_size, true, false, FLAGS_repeat); // Directly infer with the original model.
} TestDituRNNPrediction(false, false, i);
// Inference with the original model with the analysis turned on, the
// Inference with analysis and IR. The IR module will fuse some large kernels. // analysis
TEST(Analyzer, DituRNN_with_analysis_with_IR) { // module will transform the program to a data flow graph.
LOG(INFO) << "ditu rnn with analysis and IR fuse"; TestDituRNNPrediction(true, false, i);
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, // Inference with analysis and IR. The IR module will fuse some large
FLAGS_batch_size, true, true, FLAGS_repeat); // kernels.
TestDituRNNPrediction(true, true, i);
}
} }
} // namespace analysis } // namespace analysis
......
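// The multi-threaded scheme above in brief: predictors are created serially
// up front (per the TODO, the analysis phase itself is not yet thread-safe),
// and each thread owns its own inputs and outputs:
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
for (int tid = 0; tid < num_threads; ++tid) {
  predictors.emplace_back(
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config));  // `config` as built in TestDituRNNPrediction
}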
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include <fstream>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/timer.h"
DEFINE_string(infer_model, "", "Directory of the inference model.");
DEFINE_string(infer_data, "", "Path of the dataset.");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "How many times to repeat run.");
DEFINE_int32(topn, -1, "Run top n batches of data to save time");
namespace paddle {
namespace inference {
struct DataReader {
explicit DataReader(const std::string &path)
: file(new std::ifstream(path)) {}
bool NextBatch(PaddleTensor *tensor, int batch_size) {
PADDLE_ENFORCE_EQ(batch_size, 1);
std::string line;
tensor->lod.clear();
tensor->lod.emplace_back(std::vector<size_t>({0}));
std::vector<int64_t> data;
for (int i = 0; i < batch_size; i++) {
if (!std::getline(*file, line)) return false;
inference::split_to_int64(line, ' ', &data);
}
tensor->lod.front().push_back(data.size());
tensor->data.Resize(data.size() * sizeof(int64_t));
memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t));
tensor->shape.clear();
tensor->shape.push_back(data.size());
tensor->shape.push_back(1);
return true;
}
std::unique_ptr<std::ifstream> file;
};
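For context, NextBatch consumes one text line per record: space-separated int64 ids that become a tensor of shape {N, 1} with LoD {0, N}. A small stand-alone sketch of that record format (plain C++ in place of the PaddleTensor plumbing; "sample.txt" is a scratch file made up for the example):

#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  std::ofstream("sample.txt") << "101 7 42 9\n";  // one 4-token record
  std::ifstream file("sample.txt");
  std::string line;
  std::getline(file, line);
  std::istringstream ss(line);
  std::vector<int64_t> data;
  for (int64_t v; ss >> v;) data.push_back(v);
  // Mirrors DataReader: shape = {N, 1}, lod = {0, N}.
  std::cout << "shape: {" << data.size() << ", 1}, lod: {0, " << data.size()
            << "}\n";
}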
void Main(int batch_size) {
// shape --
// Create Predictor --
AnalysisConfig config;
config.model_dir = FLAGS_infer_model;
config.use_gpu = false;
config.enable_ir_optim = true;
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots(1);
// one batch starts
// data --
auto &input = input_slots[0];
input.dtype = PaddleDType::INT64;
inference::Timer timer;
double sum = 0;
std::vector<PaddleTensor> output_slots;
int num_batches = 0;
for (int t = 0; t < FLAGS_repeat; t++) {
DataReader reader(FLAGS_infer_data);
while (reader.NextBatch(&input, FLAGS_batch_size)) {
      if (FLAGS_topn > 0 && num_batches >= FLAGS_topn) break;  // stop after top-n batches
timer.tic();
CHECK(predictor->Run(input_slots, &output_slots));
sum += timer.toc();
++num_batches;
}
}
PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
// Get output
LOG(INFO) << "get outputs " << output_slots.size();
for (auto &output : output_slots) {
LOG(INFO) << "output.shape: " << to_string(output.shape);
    // The output is expected to carry no LoD.
CHECK_EQ(output.lod.size(), 0UL);
LOG(INFO) << "output.dtype: " << output.dtype;
std::stringstream ss;
for (int i = 0; i < 5; i++) {
ss << static_cast<float *>(output.data.data())[i] << " ";
}
LOG(INFO) << "output.data summary: " << ss.str();
// one batch ends
}
}
TEST(text_classification, basic) { Main(FLAGS_batch_size); }
} // namespace inference
} // namespace paddle
...@@ -263,7 +263,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { ...@@ -263,7 +263,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
}; };
} // namespace } // namespace
Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { AnalysisPass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_IA_graphviz_log_root, FLAGS_IA_graphviz_log_root,
"data_flow_graph_to_fluid_graphviz_debugger")); "data_flow_graph_to_fluid_graphviz_debugger"));
......
...@@ -21,8 +21,8 @@ ...@@ -21,8 +21,8 @@
#include <string> #include <string>
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -42,7 +42,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass { ...@@ -42,7 +42,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
return "Transform a DFG to a Fluid ProgramDesc"; return "Transform a DFG to a Fluid ProgramDesc";
} }
Pass *CreateGraphvizDebugerPass() const override; AnalysisPass *CreateGraphvizDebugerPass() const override;
protected: protected:
// Add a Fluid Op into the ProgramDesc. // Add a Fluid Op into the ProgramDesc.
......
...@@ -21,8 +21,8 @@ limitations under the License. */ ...@@ -21,8 +21,8 @@ limitations under the License. */
#include <fstream> #include <fstream>
#include <string> #include <string>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
DECLARE_string(IA_graphviz_log_root);
DECLARE_string(IA_output_storage_path);
DECLARE_bool(IA_enable_ir);
...@@ -66,7 +66,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { ...@@ -66,7 +66,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
}; };
} }
Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const { AnalysisPass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger")); FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger"));
} }
......
...@@ -22,8 +22,8 @@ ...@@ -22,8 +22,8 @@
#include <string> #include <string>
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -46,7 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { ...@@ -46,7 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
return "transform a fluid ProgramDesc to a data flow graph."; return "transform a fluid ProgramDesc to a data flow graph.";
} }
Pass *CreateGraphvizDebugerPass() const override; AnalysisPass *CreateGraphvizDebugerPass() const override;
private: private:
framework::proto::ProgramDesc const *desc_; framework::proto::ProgramDesc const *desc_;
......
...@@ -14,14 +14,17 @@ ...@@ -14,14 +14,17 @@
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using namespace framework;
static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__"; static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__";
...@@ -47,7 +50,8 @@ class FluidToIrPass final : public DataFlowGraphPass { ...@@ -47,7 +50,8 @@ class FluidToIrPass final : public DataFlowGraphPass {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
// Load program. // Load program.
auto program = LoadProgramDesc(*argument->fluid_model_program_path); auto program = LoadProgramDesc(*argument->fluid_model_program_path);
argument->origin_program_desc.reset(new proto::ProgramDesc(program)); argument->origin_program_desc.reset(
new framework::proto::ProgramDesc(program));
// Create main data flow graph. // Create main data flow graph.
if (!argument->main_dfg) { if (!argument->main_dfg) {
argument->main_dfg.reset(new DataFlowGraph); argument->main_dfg.reset(new DataFlowGraph);
...@@ -77,27 +81,30 @@ class FluidToIrPass final : public DataFlowGraphPass { ...@@ -77,27 +81,30 @@ class FluidToIrPass final : public DataFlowGraphPass {
IRPassManager ir_passes(argument_->Get<ProgramDesc>("ir_program_desc"), IRPassManager ir_passes(argument_->Get<ProgramDesc>("ir_program_desc"),
nullptr); nullptr);
// Pass the scope from analysis to IR if needed. // Pass the scope from analysis to IR if needed.
if (argument_->Has(ir::kParamScopeAttr)) { if (argument_->Has(framework::ir::kParamScopeAttr)) {
      // Here the address is passed; note that IR doesn't own the scope, so // Here the address is passed; note that IR doesn't own the scope, so
// the real scope in analysis should live during the IR phase. // the real scope in analysis should live during the IR phase.
ir_passes.graph().Set( ir_passes.graph().Set(
ir::kParamScopeAttr, framework::ir::kParamScopeAttr,
new Scope *(&argument_->Get<Scope>(ir::kParamScopeAttr))); new framework::Scope *(&argument_->Get<framework::Scope>(
framework::ir::kParamScopeAttr)));
} }
    if (FLAGS_IA_enable_ir) {
      const auto &ir_passes_to_apply =
          argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr);
      ir_passes.Apply(ir_passes_to_apply);
    }
PADDLE_ENFORCE(argument_->main_dfg.get()); PADDLE_ENFORCE(argument_->main_dfg.get());
argument_->main_dfg->Build(ir_passes.graph()); argument_->main_dfg->Build(ir_passes.graph());
// inherit the arguments from ir. // inherit the arguments from ir.
if (ir_passes.graph().Has(ir::kFuseStatisAttr)) { if (ir_passes.graph().Has(framework::ir::kFuseStatisAttr)) {
argument_->Set( argument_->Set(
ir::kFuseStatisAttr, framework::ir::kFuseStatisAttr,
new std::unordered_map<std::string, int>( new std::unordered_map<std::string, int>(
ir_passes.graph().Get<std::unordered_map<std::string, int>>( ir_passes.graph().Get<std::unordered_map<std::string, int>>(
ir::kFuseStatisAttr))); framework::ir::kFuseStatisAttr)));
} }
} }
...@@ -109,7 +116,7 @@ class FluidToIrPass final : public DataFlowGraphPass { ...@@ -109,7 +116,7 @@ class FluidToIrPass final : public DataFlowGraphPass {
private: private:
// Load parameters from a single file or from a directory. // Load parameters from a single file or from a directory.
bool LoadParams(Scope *scope, const std::string &dir, bool LoadParams(framework::Scope *scope, const std::string &dir,
const std::string &prog_file, const std::string &param_file); const std::string &prog_file, const std::string &param_file);
private: private:
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/analysis_pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
......
...@@ -40,17 +40,6 @@ void DfgPassManager::RunAll() { ...@@ -40,17 +40,6 @@ void DfgPassManager::RunAll() {
} }
} }
void NodePassManager::RunAll() {
PADDLE_ENFORCE(argument_);
PADDLE_ENFORCE(argument_->main_dfg.get());
auto trait = GraphTraits<DataFlowGraph>(*argument_->main_dfg).nodes_in_DFS();
for (auto& node : trait) {
for (auto& pass : data_) {
pass->Run(&node);
}
}
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -33,7 +33,7 @@ limitations under the License. */ ...@@ -33,7 +33,7 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/analysis_pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -43,7 +43,7 @@ namespace analysis { ...@@ -43,7 +43,7 @@ namespace analysis {
* PassManager is the base class for all pass managers, a pass manager has * PassManager is the base class for all pass managers, a pass manager has
* several Pass-es registered, and execute them in the linear order. * several Pass-es registered, and execute them in the linear order.
*/ */
class PassManager : public OrderedRegistry<Pass> { class PassManager : public OrderedRegistry<AnalysisPass> {
public: public:
PassManager() = default; PassManager() = default;
// Call all the passes' Initialize methods. The desc and data_flow_graph are // Call all the passes' Initialize methods. The desc and data_flow_graph are
...@@ -89,18 +89,6 @@ class DfgPassManager : public PassManager { ...@@ -89,18 +89,6 @@ class DfgPassManager : public PassManager {
virtual ~DfgPassManager() = default; virtual ~DfgPassManager() = default;
}; };
/*
* A pass manager that process a Node each time.
*/
class NodePassManager : public PassManager {
public:
NodePassManager() = default;
void RunAll() override;
virtual ~NodePassManager() = default;
};
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -34,28 +34,6 @@ class TestDfgPassManager final : public DfgPassManager { ...@@ -34,28 +34,6 @@ class TestDfgPassManager final : public DfgPassManager {
std::string description() const override { return "test doc"; } std::string description() const override { return "test doc"; }
}; };
class TestNodePassManager final : public NodePassManager {
public:
virtual ~TestNodePassManager() = default;
std::string repr() const override { return "test-node-pass-manager"; }
std::string description() const override { return "test doc"; }
};
class TestNodePass final : public NodePass {
public:
virtual ~TestNodePass() = default;
bool Initialize(Argument* argument) override { return true; }
void Run(Node* node) override {
LOG(INFO) << "- Processing node " << node->repr();
}
std::string repr() const override { return "test-node"; }
std::string description() const override { return "some doc"; }
};
TEST(PassManager, DFG_pass_manager) { TEST(PassManager, DFG_pass_manager) {
TestDfgPassManager manager; TestDfgPassManager manager;
DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
...@@ -71,19 +49,6 @@ TEST(PassManager, DFG_pass_manager) { ...@@ -71,19 +49,6 @@ TEST(PassManager, DFG_pass_manager) {
manager.RunAll(); manager.RunAll();
} }
TEST(PassManager, Node_pass_manager) {
Argument argument(FLAGS_inference_model_dir);
// Pre-process: initialize the DFG with the ProgramDesc first.
FluidToDataFlowGraphPass pass0;
pass0.Initialize(&argument);
pass0.Run(argument.main_dfg.get());
TestNodePassManager manager;
manager.Register("test-node-pass", new TestNodePass);
ASSERT_TRUE(manager.Initialize(&argument));
manager.RunAll();
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -68,7 +68,7 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass { ...@@ -68,7 +68,7 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass {
} }
}; };
Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const { AnalysisPass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root, DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root,
"tensorrt_marked_node"); "tensorrt_marked_node");
return new DfgDebuggerPass(config); return new DfgDebuggerPass(config);
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
namespace paddle { namespace paddle {
...@@ -48,7 +48,7 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { ...@@ -48,7 +48,7 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
return "tensorrt sub-graph mark pass"; return "tensorrt sub-graph mark pass";
} }
Pass* CreateGraphvizDebugerPass() const override; AnalysisPass* CreateGraphvizDebugerPass() const override;
bool Finalize() override; bool Finalize() override;
private: private:
......
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/inference/analysis/node.h"
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
namespace paddle { namespace paddle {
......
...@@ -44,8 +44,7 @@ function(inference_api_test TARGET_NAME) ...@@ -44,8 +44,7 @@ function(inference_api_test TARGET_NAME)
endfunction(inference_api_test) endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis)
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
SRCS api_tester.cc SRCS api_tester.cc
DEPS paddle_inference_api) DEPS paddle_inference_api)
...@@ -61,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine ...@@ -61,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
endif() endif()
if (WITH_ANAKIN AND WITH_GPU) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so. # compile the libinference_anakin_api.a and anakin.so.
cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml) cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml)
cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
...@@ -71,12 +70,24 @@ if (WITH_ANAKIN AND WITH_GPU) # only needed in CI ...@@ -71,12 +70,24 @@ if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
anakin_target(inference_anakin_api) anakin_target(inference_anakin_api)
anakin_target(inference_anakin_api_shared) anakin_target(inference_anakin_api_shared)
if (WITH_TESTING) if (WITH_TESTING)
  # TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
  set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
  set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
  set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
  execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
  execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
  execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
  if(WITH_GPU)
    set(anakin_test_extra_deps dynload_cuda)
    set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
    execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
    cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc
            ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
            DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
  endif()
  cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc
          ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
               --datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt
          DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
endif(WITH_TESTING) endif(WITH_TESTING)
endif() endif()
...@@ -14,24 +14,40 @@ ...@@ -14,24 +14,40 @@
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include <memory> #include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile);
namespace paddle { namespace paddle {
bool AnalysisPredictor::Init( bool AnalysisPredictor::Init(
const std::shared_ptr<framework::Scope>& parent_scope) { const std::shared_ptr<framework::Scope>& parent_scope) {
VLOG(3) << "Predictor::init()"; VLOG(3) << "Predictor::init()";
#if !defined(_WIN32)
if (FLAGS_profile) {
LOG(WARNING) << "Profiler is actived, might affect the performance";
LOG(INFO) << "You can turn off by set gflags '-profile false'";
auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
: platform::ProfilerState::kCPU;
platform::EnableProfiler(tracking_device);
}
#endif
if (config_.use_gpu) { if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
LOG(WARNING) << "ir optimize only supports CPU currently";
config_.enable_ir_optim = false;
} else { } else {
place_ = paddle::platform::CPUPlace(); place_ = paddle::platform::CPUPlace();
} }
PADDLE_ENFORCE(!parent_scope);
if (parent_scope) { if (parent_scope) {
scope_ = parent_scope; scope_ = parent_scope;
sub_scope_ = &(parent_scope->NewScope()); sub_scope_ = &(parent_scope->NewScope());
...@@ -73,7 +89,7 @@ bool AnalysisPredictor::Init( ...@@ -73,7 +89,7 @@ bool AnalysisPredictor::Init(
void AnalysisPredictor::OptimizeInferenceProgram() { void AnalysisPredictor::OptimizeInferenceProgram() {
LOG(INFO) << "optimize begin"; LOG(INFO) << "optimize begin";
FLAGS_IA_enable_ir = true; FLAGS_IA_enable_ir = config_.enable_ir_optim;
FLAGS_IA_enable_tensorrt_subgraph_engine = false; FLAGS_IA_enable_tensorrt_subgraph_engine = false;
FLAGS_IA_output_storage_path = ""; // Don't output the model. FLAGS_IA_output_storage_path = ""; // Don't output the model.
// Analyze inference_program // Analyze inference_program
...@@ -90,24 +106,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -90,24 +106,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
} }
  argument_.origin_program_desc.reset(
      new ProgramDesc(*inference_program_->Proto()));
  PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude,
                 "Only kExclude is supported yet.");
  Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);

  CHECK(argument_.transformed_program_desc);
  VLOG(5) << "to prepare executor";
  inference_program_.reset(
      new framework::ProgramDesc(*argument_.transformed_program_desc));
  if (argument_.Has(framework::ir::kParamScopeAttr)) {
    // Update scope.
    scope_.reset(
        argument_.Release<framework::Scope>(framework::ir::kParamScopeAttr));
  }
  LOG(INFO) << "== optimize end ==";
}
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) { AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) {
VLOG(3) << "create NativePredictor"; VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memory // 1. GPU memory
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
...@@ -28,7 +30,7 @@ using framework::proto::ProgramDesc; ...@@ -28,7 +30,7 @@ using framework::proto::ProgramDesc;
*/ */
class AnalysisPredictor : public NativePaddlePredictor { class AnalysisPredictor : public NativePaddlePredictor {
public: public:
explicit AnalysisPredictor(const NativeConfig& config) explicit AnalysisPredictor(const AnalysisConfig& config)
: NativePaddlePredictor(config), config_(config) {} : NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope); bool Init(const std::shared_ptr<framework::Scope>& parent_scope);
...@@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor { ...@@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor {
Argument& analysis_argument() { return argument_; } Argument& analysis_argument() { return argument_; }
private: private:
NativeConfig config_; AnalysisConfig config_;
Argument argument_; Argument argument_;
}; };
......
...@@ -193,7 +193,9 @@ PaddleInferenceAnakinPredictor<Target>::Clone() { ...@@ -193,7 +193,9 @@ PaddleInferenceAnakinPredictor<Target>::Clone() {
return std::move(cls); return std::move(cls);
} }
#ifdef PADDLE_WITH_CUDA
template class PaddleInferenceAnakinPredictor<anakin::NV>; template class PaddleInferenceAnakinPredictor<anakin::NV>;
#endif
template class PaddleInferenceAnakinPredictor<anakin::X86>; template class PaddleInferenceAnakinPredictor<anakin::X86>;
// A factory to help create different predictors. // A factory to help create different predictors.
...@@ -202,10 +204,15 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -202,10 +204,15 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) { AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
VLOG(3) << "Anakin Predictor create."; VLOG(3) << "Anakin Predictor create.";
if (config.target_type == AnakinConfig::NVGPU) { if (config.target_type == AnakinConfig::NVGPU) {
#ifdef PADDLE_WITH_CUDA
VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ]."; VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
std::unique_ptr<PaddlePredictor> x( std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor<anakin::NV>(config)); new PaddleInferenceAnakinPredictor<anakin::NV>(config));
return x; return x;
#else
LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment";
return nullptr;
#endif
} else if (config.target_type == AnakinConfig::X86) { } else if (config.target_type == AnakinConfig::X86) {
VLOG(3) << "Anakin Predictor create on [ Intel X86 ]."; VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
std::unique_ptr<PaddlePredictor> x( std::unique_ptr<PaddlePredictor> x(
......
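Given the guard above, a CPU-only build can still obtain an Anakin predictor by requesting the X86 target. A hedged sketch follows; only target_type and the factory specialization are taken from this patch, everything else is assumed:

#include <memory>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

std::unique_ptr<paddle::PaddlePredictor> MakeCpuAnakinPredictor() {
  paddle::AnakinConfig config;
  // NVGPU would return nullptr unless the build has PADDLE_WITH_CUDA.
  config.target_type = paddle::AnakinConfig::X86;
  return paddle::CreatePaddlePredictor<paddle::AnakinConfig,
                                       paddle::PaddleEngineKind::kAnakin>(
      config);
}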
...@@ -20,71 +20,16 @@ limitations under the License. */ ...@@ -20,71 +20,16 @@ limitations under the License. */
#include <iostream> #include <iostream>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <vector> #include <vector>
#include "framework/core/net/net.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
#include "utils/logger/logger.h"
DEFINE_string(model, "", "Directory of the inference model."); DEFINE_string(model, "", "Directory of the inference model.");
DEFINE_string(datapath, "", "Path of the dataset."); DEFINE_string(datapath, "", "Path of the dataset.");
DEFINE_int32(batch_size, 1, "batch size."); DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
// Timer for timer
class Timer {
public:
double start;
double startu;
void tic() {
struct timeval tp;
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
}
double toc() {
struct timeval tp;
gettimeofday(&tp, NULL);
double used_time_ms =
(tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
return used_time_ms;
}
};
std::vector<std::string> string_split(std::string in_str,
std::string delimiter) {
std::vector<std::string> seq;
int found = in_str.find(delimiter);
int pre_found = -1;
while (found != std::string::npos) {
if (pre_found == -1) {
seq.push_back(in_str.substr(0, found));
} else {
seq.push_back(in_str.substr(pre_found + delimiter.length(),
found - delimiter.length() - pre_found));
}
pre_found = found;
found = in_str.find(delimiter, pre_found + delimiter.length());
}
seq.push_back(
in_str.substr(pre_found + 1, in_str.length() - (pre_found + 1)));
return seq;
}
std::vector<std::string> string_split(
std::string in_str, std::vector<std::string>& delimiter) { // NOLINT
std::vector<std::string> in;
std::vector<std::string> out;
out.push_back(in_str);
for (auto del : delimiter) {
in = out;
out.clear();
for (auto s : in) {
auto out_s = string_split(s, del);
for (auto o : out_s) {
out.push_back(o);
}
}
}
return out;
}
class Data { class Data {
public: public:
Data(std::string file_name, int batch_size) Data(std::string file_name, int batch_size)
...@@ -120,36 +65,24 @@ void Data::get_batch_data( ...@@ -120,36 +65,24 @@ void Data::get_batch_data(
week_fea.clear(); week_fea.clear();
time_fea.clear(); time_fea.clear();
    while (_file.getline(buf, 10000)) {
      std::vector<std::string> data_vec;
      paddle::inference::split(buf, ':', &data_vec);

      std::vector<std::string> seq;
      paddle::inference::split(data_vec[0], '|', &seq);

      for (auto link : seq) {
        std::vector<float> vec;
        paddle::inference::split_to_float(link, ',', &vec);
        fea.push_back(vec);
      }

      std::vector<float> vec_w;
      paddle::inference::split_to_float(data_vec[2], ',', &vec_w);
      week_fea.push_back(vec_w);

      std::vector<float> vec_t;
      paddle::inference::split_to_float(data_vec[1], ',', &vec_t);
      time_fea.push_back(vec_t);

      cum += seq.size();
...@@ -275,14 +208,13 @@ void single_test() { ...@@ -275,14 +208,13 @@ void single_test() {
inputs.push_back(tensor_2); inputs.push_back(tensor_2);
inputs.push_back(tensor_0); inputs.push_back(tensor_0);
  paddle::inference::Timer timer;
  timer.tic();

  for (int i = 0; i < FLAGS_repeat; i++) predictor->Run(inputs, &outputs);

  paddle::inference::PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0,
                               timer.toc() / FLAGS_repeat);
  LOG(INFO) << "sequence_length = " << seq_offset[seq_offset.size() - 1];
float* data_o = static_cast<float*>(outputs[0].data.data()); float* data_o = static_cast<float*>(outputs[0].data.data());
VLOG(3) << "outputs[0].data.length() = " << outputs[0].data.length(); VLOG(3) << "outputs[0].data.length() = " << outputs[0].data.length();
......
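The rewritten get_batch_data above leans on the shared helpers instead of the removed local string_split. A sketch of those helpers on a made-up record, assuming helper.h exposes split/split_to_float with the signatures used above:

#include <iostream>
#include <string>
#include <vector>

#include "paddle/fluid/inference/api/helper.h"

int main() {
  // Assumed layout from get_batch_data: "links:time:week", links are
  // '|'-separated, every field is ','-separated floats.
  std::string record = "0.1,0.2|0.3,0.4:1,0:0,1";
  std::vector<std::string> fields;
  paddle::inference::split(record, ':', &fields);
  std::vector<std::string> links;
  paddle::inference::split(fields[0], '|', &links);
  std::vector<float> first_link;
  paddle::inference::split_to_float(links[0], ',', &first_link);
  std::cout << links.size() << " links, first value " << first_link[0] << "\n";
}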
...@@ -176,7 +176,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -176,7 +176,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework::Scope *scope) { framework::Scope *scope) {
VLOG(3) << "Predictor::set_feed"; VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feeds_.size()) { if (inputs.size() != feeds_.size()) {
LOG(ERROR) << "wrong feed input size."; LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
<< inputs.size();
return false; return false;
} }
for (size_t i = 0; i < inputs.size(); ++i) { for (size_t i = 0; i < inputs.size(); ++i) {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <glog/logging.h>
#include <sys/time.h> #include <sys/time.h>
#include <algorithm> #include <algorithm>
#include <numeric> #include <numeric>
...@@ -88,5 +89,45 @@ static void TensorAssignData(PaddleTensor *tensor, ...@@ -88,5 +89,45 @@ static void TensorAssignData(PaddleTensor *tensor,
} }
} }
std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
switch (tensor.dtype) {
case PaddleDType::FLOAT32:
os << "float32";
break;
case PaddleDType::INT64:
os << "int64";
break;
default:
os << "unset";
}
os << '\n';
os << " - shape: " << to_string(tensor.shape) << '\n';
os << " - lod: ";
for (auto &l : tensor.lod) {
os << to_string(l) << "; ";
}
os << "\n";
os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
os << '\n';
return os.str();
}
void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency) {
LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms ======";
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
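The two helpers added above give the benchmarks one consistent reporting path. A minimal usage sketch; the surrounding inference loop is assumed, not part of the patch:

#include "paddle/fluid/inference/api/helper.h"

void ReportLatency() {
  paddle::inference::Timer timer;
  timer.tic();
  // ... run the 10 repeated inference calls here ...
  paddle::inference::PrintTime(/*batch_size=*/1, /*repeat=*/10,
                               /*num_threads=*/1, /*tid=*/0,
                               timer.toc() / 10);
}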
...@@ -150,6 +150,21 @@ struct TensorRTConfig : public NativeConfig { ...@@ -150,6 +150,21 @@ struct TensorRTConfig : public NativeConfig {
int workspace_size{1 << 30}; int workspace_size{1 << 30};
}; };
// NOTE WIP, not stable yet.
struct AnalysisConfig : public NativeConfig {
//
enum class IrPassMode {
kSystem, // Use system default passes, not customize.
kInclude, // Specify the passes in `ir_passes`.
kExclude // Specify the disabled passes in `ir_passes`.
};
bool enable_ir_optim = true;
IrPassMode ir_mode{IrPassMode::kExclude};
  // The attention-LSTM fuse pass works only on some specific models, so it is
  // disabled by default.
std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
};
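Since AnalysisConfig inherits from NativeConfig, a caller fills the usual model fields and then opts into IR optimization. A sketch grounded in the fields shown in this patch; the model path is a placeholder:

#include <memory>
#include <string>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

std::unique_ptr<paddle::PaddlePredictor> MakeAnalysisPredictor(
    const std::string &model_dir) {
  paddle::AnalysisConfig config;
  config.model_dir = model_dir;   // inherited from NativeConfig
  config.use_gpu = false;         // IR optimization is CPU-only for now
  config.enable_ir_optim = true;
  // kExclude: run every pass except those listed in ir_passes.
  config.ir_mode = paddle::AnalysisConfig::IrPassMode::kExclude;
  return paddle::CreatePaddlePredictor<paddle::AnalysisConfig,
                                       paddle::PaddleEngineKind::kAnalysis>(
      config);
}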
// A factory to help create different predictors. // A factory to help create different predictors.
// //
// FOR EXTENSION DEVELOPER: // FOR EXTENSION DEVELOPER:
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/auc_op.h" #include "paddle/fluid/operators/auc_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -36,15 +35,12 @@ class AucOp : public framework::OperatorWithKernel { ...@@ -36,15 +35,12 @@ class AucOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(predict_height, label_height, PADDLE_ENFORCE_EQ(predict_height, label_height,
"Out and Label should have same height."); "Out and Label should have same height.");
int num_thres = ctx->Attrs().Get<int>("num_thresholds"); int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
ctx->SetOutputDim("AUC", {1}); ctx->SetOutputDim("AUC", {1});
ctx->SetOutputDim("TPOut", {num_thres}); ctx->SetOutputDim("BatchAUC", {1});
ctx->SetOutputDim("TNOut", {num_thres}); ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
ctx->SetOutputDim("FPOut", {num_thres}); ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
ctx->SetOutputDim("FNOut", {num_thres});
ctx->ShareLoD("Predict", /*->*/ "AUC");
} }
protected: protected:
...@@ -66,25 +62,24 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -66,25 +62,24 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label", AddInput("Label",
"A 2D int tensor indicating the label of the training data. " "A 2D int tensor indicating the label of the training data. "
"shape: [batch_size, 1]"); "shape: [batch_size, 1]");
AddInput("TP", "True-Positive value.");
AddInput("FP", "False-Positive value.");
AddInput("TN", "True-Negative value.");
AddInput("FN", "False-Negative value.");
// TODO(typhoonzero): support weight input // TODO(typhoonzero): support weight input
AddInput("StatPos", "Statistic value when label = 1");
AddInput("StatNeg", "Statistic value when label = 0");
AddOutput("AUC", AddOutput("AUC",
"A scalar representing the " "A scalar representing the "
"current area-under-the-curve."); "current area-under-the-curve.");
AddOutput("TPOut", "True-Positive value."); AddOutput("BatchAUC", "The AUC for current batch");
AddOutput("FPOut", "False-Positive value."); AddOutput("StatPosOut", "Statistic value when label = 1");
AddOutput("TNOut", "True-Negative value."); AddOutput("StatNegOut", "Statistic value when label = 0");
AddOutput("FNOut", "False-Negative value.");
AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.") AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
.SetDefault("ROC"); .SetDefault("ROC");
AddAttr<int>("num_thresholds", AddAttr<int>("num_thresholds",
"The number of thresholds to use when discretizing the" "The number of thresholds to use when discretizing the"
" roc curve.") " roc curve.")
.SetDefault(200); .SetDefault((2 << 12) - 1);
AddComment(R"DOC( AddComment(R"DOC(
Area Under The Curve (AUC) Operator. Area Under The Curve (AUC) Operator.
......
...@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
...@@ -23,106 +23,85 @@ namespace operators { ...@@ -23,106 +23,85 @@ namespace operators {
using Tensor = framework::Tensor;

template <typename DeviceContext, typename T>
class AucKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto *predict = ctx.Input<Tensor>("Predict");
    auto *label = ctx.Input<Tensor>("Label");

    std::string curve = ctx.Attr<std::string>("curve");
    int num_thresholds = ctx.Attr<int>("num_thresholds");
    int num_pred_buckets = num_thresholds + 1;

    // Only use output var for now, make sure it's persistable and
    // not cleaned up for each batch.
    auto *auc = ctx.Output<Tensor>("AUC");
    auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
    auto *stat_neg = ctx.Output<Tensor>("StatNegOut");

    auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
    auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
    calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
            auc);

    auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
    std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
    std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
    calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
            num_thresholds, batch_auc);
  }

 private:
  inline static double trapezoidArea(double X1, double X2, double Y1,
                                     double Y2) {
    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
  }

  inline static void calcAuc(const framework::ExecutionContext &ctx,
                             const framework::Tensor *label,
                             const framework::Tensor *predict,
                             int64_t *stat_pos, int64_t *stat_neg,
                             int num_thresholds,
                             framework::Tensor *auc_tensor) {
    size_t batch_size = predict->dims()[0];
    size_t inference_width = predict->dims()[1];
    const T *inference_data = predict->data<T>();
    const auto *label_data = label->data<int64_t>();

    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());

    for (size_t i = 0; i < batch_size; i++) {
      uint32_t binIdx = static_cast<uint32_t>(
          inference_data[i * inference_width + 1] * num_thresholds);
      if (label_data[i]) {
        stat_pos[binIdx] += 1;
      } else {
        stat_neg[binIdx] += 1;
      }
    }

    *auc = 0.0f;

    double totPos = 0.0;
    double totNeg = 0.0;
    double totPosPrev = 0.0;
    double totNegPrev = 0.0;

    int idx = num_thresholds;

    while (idx >= 0) {
      totPosPrev = totPos;
      totNegPrev = totNeg;
      totPos += stat_pos[idx];
      totNeg += stat_neg[idx];
      *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);

      --idx;
    }

    if (totPos > 0.0 && totNeg > 0.0) {
      *auc = *auc / totPos / totNeg;
    }
  }
};
......
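The new kernel replaces per-threshold TP/FP/TN/FN counting with a histogram: each prediction lands in one of num_thresholds + 1 buckets, and the ROC area is accumulated with trapezoids while sweeping the buckets from the highest score down. A standalone sketch of that accumulation with toy counts (not data from the patch):

#include <cstdint>
#include <iostream>
#include <vector>

double BucketAuc(const std::vector<int64_t> &stat_pos,
                 const std::vector<int64_t> &stat_neg) {
  double auc = 0.0, tot_pos = 0.0, tot_neg = 0.0;
  for (int idx = static_cast<int>(stat_pos.size()) - 1; idx >= 0; --idx) {
    double pos_prev = tot_pos, neg_prev = tot_neg;
    tot_pos += stat_pos[idx];
    tot_neg += stat_neg[idx];
    // Same as trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev) above.
    auc += (tot_neg - neg_prev) * (tot_pos + pos_prev) / 2.0;
  }
  return (tot_pos > 0.0 && tot_neg > 0.0) ? auc / tot_pos / tot_neg : 0.0;
}

int main() {
  // 4 buckets; positives lean toward the high-score buckets.
  std::cout << BucketAuc({0, 1, 1, 2}, {2, 1, 1, 0}) << "\n";  // prints 0.875
}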
...@@ -119,7 +119,8 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> { ...@@ -119,7 +119,8 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& last_scale, const framework::Tensor& last_scale,
const framework::Tensor& iter, const int window_size, const framework::Tensor& iter, const int window_size,
framework::Tensor* scales_arr, framework::Tensor* out_scale) { framework::Tensor* scales_arr, framework::Tensor* out_scale) {
auto& gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace()); const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
T* scale_arr = scales_arr->mutable_data<T>(gpu_place); T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place); T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
......
...@@ -157,6 +157,116 @@ class FlattenGradOp : public framework::OperatorBase { ...@@ -157,6 +157,116 @@ class FlattenGradOp : public framework::OperatorBase {
} }
}; };
// FIXME(zcd): flatten2 adds an intermediate output(XShape) based on flatten,
// the XShape is used to carry the shape and lod of X which will be used in
// flatten_grad, in this way, the framework can reuse the memory of X
// immediately the flatten2_op is finished.
// Considering compatibility issues, we could not fix flatten2_op
class Flatten2OpInferShape : public FlattenOpInferShape {
public:
void operator()(framework::InferShapeContext *ctx) const override {
FlattenOpInferShape::operator()(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output (XShape) of Flatten op should not be null.");
const auto &in_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(in_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < in_dims.size(); ++i) {
xshape_dims[i + 1] = in_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", "XShape");
}
};
class Flatten2Op : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axis = Attr<int>("axis");
auto in_dims =
scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims);
framework::AttributeMap attrs;
attrs["shape"] = out_dims;
attrs["inplace"] = false;
// Invoke Reshape Op
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class Flatten2OpMaker : public FlattenOpMaker {
public:
void Make() override {
FlattenOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in FlattenGradOp.")
.AsIntermediate();
}
};
class Flatten2GradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("flatten2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Flatten2GradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("XShape"),
"Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
};
class Flatten2GradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto xshape_name = Input("XShape");
auto xshape_dims =
scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
attrs["inplace"] = false;
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {dout_name}}, {"Shape", {}}},
{{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
reshape_op->Run(scope, place);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -167,3 +277,8 @@ REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker, ...@@ -167,3 +277,8 @@ REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
ops::FlattenOpInferShape, ops::FlattenOpInferShape,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape); REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker);
REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
ops::Flatten2GradInferShape);
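The FIXME above describes the trick these two operators rely on: flatten2 emits XShape = {0, X.dims...} purely to carry X's shape and LoD into flatten2_grad, which recovers the original shape by slicing off the leading 0. A toy sketch with plain vectors in place of DDim:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> x_dims = {4, 3, 8};
  // Forward: prepend a dummy 0, mirroring Flatten2OpInferShape.
  std::vector<int64_t> xshape(x_dims.size() + 1, 0);
  for (size_t i = 0; i < x_dims.size(); ++i) xshape[i + 1] = x_dims[i];
  // Backward: drop the dummy, mirroring Flatten2GradInferShape's slice_ddim.
  std::vector<int64_t> recovered(xshape.begin() + 1, xshape.end());
  std::cout << recovered[0] << "x" << recovered[1] << "x" << recovered[2]
            << "\n";  // 4x3x8
}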
...@@ -30,14 +30,7 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -30,14 +30,7 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"Input(WeightX) of GRU should not be null."); "Input(WeightX) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasInput("WeightH"), PADDLE_ENFORCE(ctx->HasInput("WeightH"),
"Input(WeightH) of GRU should not be null."); "Input(WeightH) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null."); PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
"Output(BatchedOut) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"), PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of GRU should not be null."); "Output(Hidden) of GRU should not be null.");
...@@ -80,15 +73,20 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -80,15 +73,20 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
} }
framework::DDim out_dims({x_dims[0], frame_size}); framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Hidden", out_dims);
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims);
ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Hidden");
int xx_width; int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) { if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1]; xx_width = wx_dims[1];
} else { } else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
"Output(BatchedOut) of GRU should not be null.");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims);
} }
ctx->SetOutputDim("XX", {x_dims[0], xx_width}); ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->ShareLoD("X", "XX"); ctx->ShareLoD("X", "XX");
......
...@@ -38,16 +38,6 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -38,16 +38,6 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
"Output(Hidden) of LSTM should not be null."); "Output(Hidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"), PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of LSTM should not be null."); "Output(Cell) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
"Output(BatchedHidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
"Output(BatchedCell) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
"Output(ReorderedC0) of LSTM should not be null.");
  auto x_dims = ctx->GetInputDim("X");
  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
@@ -88,28 +78,36 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
  PADDLE_ENFORCE_EQ(b_dims[0], 1,
                    "The first dimension of Input(Bias) should be 1.");
- PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_peepholes"),
-                "Do not support peephole yet.");
- PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
-                   "The second dimension of Input(Bias) should be "
-                   "4 * %d if disable peepholes connection",
-                   frame_size);
  PADDLE_ENFORCE_EQ(
      b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
      "The second dimension of Input(Bias) should be "
      "7 * %d if enable peepholes connection or"
      "4 * %d if disable peepholes",
      frame_size, frame_size);
  framework::DDim out_dims({x_dims[0], frame_size});
  ctx->SetOutputDim("Hidden", out_dims);
  ctx->SetOutputDim("Cell", out_dims);
- ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
- ctx->SetOutputDim("BatchedHidden", out_dims);
- ctx->SetOutputDim("BatchedCell", out_dims);
ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Hidden");
ctx->ShareLoD("X", "Cell"); ctx->ShareLoD("X", "Cell");
int xx_width; int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) { if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1]; xx_width = wx_dims[1];
} else { } else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
"Output(BatchedHidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
"Output(BatchedCell) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
"Output(ReorderedC0) of LSTM should not be null.");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedHidden", out_dims);
ctx->SetOutputDim("BatchedCell", out_dims);
  }
  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
  ctx->ShareLoD("X", "XX");
@@ -242,7 +240,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  auto* xx = ctx.Output<LoDTensor>("XX");              \
  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");  \
  auto* cell_out = ctx.Output<LoDTensor>("Cell");      \
- bool is_reverse = ctx.Attr<bool>("is_reverse");
  bool is_reverse = ctx.Attr<bool>("is_reverse");      \
bool use_peepholes = ctx.Attr<bool>("use_peepholes");
#define INIT_BASE_SIZES              \
  auto x_dims = x->dims(); /* T x M*/ \
@@ -253,99 +252,165 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  const int D3 = D * 3;              \
  const int D4 = wh_dims[1];
#define INIT_BASE_INPUT_DATAS \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
/* diagonal weight*/ \
const T* wc_data = bias->data<T>() + D4; \
/* for peephole only*/ \
Tensor checked_cell; \
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \
checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \
}
/// Compute LSTM
#define GEMM_WH_ADDON(bs, prev, out) \
blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
wh_data, D4, static_cast<T>(1), out, D4)
// gates: W_ch, W_ih, W_fh, W_oh
#define GET_Ct(ct_1, gates, ct) \
/* C_t = C_t-1 * fgated + cand_gated * igated*/ \
act_cand(D, gates, gates); \
blas.VMUL(D, gates, gates + D, gates + D); \
blas.VMUL(D, ct_1, gates + D2, gates + D2); \
blas.VADD(D, gates + D, gates + D2, ct)
#define GET_Ht(ct, gates, ht) \
/* H_t = act_cell(C_t) * ogated */ \
act_cell(D, ct, gates + D2); \
blas.VMUL(D, gates + D2, gates + D3, ht)
#define GET_Ct_NOH0C0(gates, ct) \
/* C_t = igated * cgated*/ \
act_gate(D, gates + D, gates + D); \
act_cand(D, gates, gates); \
blas.VMUL(D, gates, gates + D, ct)
#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
GET_Ct_NOH0C0(gates, ct); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
GET_Ct_NOH0C0(gates, ct); \
/* get outgated, put W_oc * C_t on igated */ \
blas.VMUL(D, wc_data + D2, ct, gates + D); \
blas.VADD(D, gates + D, gates + D3, gates + D3); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
act_gate(D3, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \
/* get fgated and igated*/ \
blas.VMUL(D, wc_data, ct_1, checked_cell_data); \
blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
act_gate(D2, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
/* get ogated*/ \
blas.VMUL(D, wc_data + D2, ct, gates + D); \
blas.VADD(D, gates + D, gates + D3, gates + D3); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
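The macros above compute one fused LSTM step with the gate buffer laid out as
[candidate | input | forget | output], each of width D, and with the optional
peephole connections reading the diagonal weights wc. The following NumPy
sketch shows the same step, assuming the default sigmoid gate and tanh
cell/candidate activations; the helper name lstm_step is illustrative, not
part of the operator:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(gates, c_prev, wc=None):
    # Gate layout mirrors the macros: gates, gates+D, gates+D2, gates+D3.
    D = c_prev.shape[-1]
    c_hat, i, f, o = (gates[k * D:(k + 1) * D] for k in range(4))
    if wc is not None:  # peephole: C_{t-1} feeds the input and forget gates
        i = i + wc[:D] * c_prev
        f = f + wc[D:2 * D] * c_prev
    i, f = sigmoid(i), sigmoid(f)
    c = f * c_prev + i * np.tanh(c_hat)      # GET_Ct
    if wc is not None:  # peephole: C_t feeds the output gate
        o = o + wc[2 * D:] * c
    h = sigmoid(o) * np.tanh(c)              # GET_Ht
    return c, h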
void SeqCompute(const framework::ExecutionContext& ctx) const { void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext; using DeviceContext = paddle::platform::CPUDeviceContext;
INIT_BASE_INPUT_OUTPUT INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES INIT_BASE_SIZES
INIT_VEC_FUNC INIT_VEC_FUNC
INIT_BASE_INPUT_DATAS
auto x_lod = x->lod(); auto x_lod = x->lod();
const int total_T = x_dims[0]; const int total_T = x_dims[0];
const int N = x_lod[0].size() - 1; // batch size const int N = x_lod[0].size() - 1;
const T* x_data = x->data<T>();
const T* h0_data = h0 ? h0->data<T>() : nullptr; const T* h0_data = h0 ? h0->data<T>() : nullptr;
const T* c0_data = c0 ? c0->data<T>() : nullptr; const T* c0_data = c0 ? c0->data<T>() : nullptr;
const T* wx_data = wx->data<T>(); T* xx_data = xx->mutable_data<T>(place);
const T* wh_data = wh->data<T>(); T* h_out_data = hidden_out->mutable_data<T>(place);
T* xx_data = xx->mutable_data<T>(ctx.GetPlace()); T* c_out_data = cell_out->mutable_data<T>(place);
T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data, math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
xx_data, bias->data<T>()); xx_data, bias->data<T>());
int xx_offset = D4; int xx_offset = D4;
int gate_offset = D; int gate_offset = D;
if (is_reverse) { if (is_reverse) {
const int offset = (total_T - 1) * D; const int offset = (total_T - 1) * D;
xx_data = xx_data + offset * 4; xx_data = xx_data + offset * 4;
hidden_out_data = hidden_out_data + offset; h_out_data = h_out_data + offset;
cell_out_data = cell_out_data + offset; c_out_data = c_out_data + offset;
xx_offset = -D4; xx_offset = -D4;
gate_offset = -D; gate_offset = -D;
} }
auto move_step = [&]() { #define MOVE_ONE_STEP \
xx_data = xx_data + xx_offset; prev_h_data = h_out_data; \
hidden_out_data = hidden_out_data + gate_offset; prev_c_data = c_out_data; \
cell_out_data = cell_out_data + gate_offset; xx_data = xx_data + xx_offset; \
}; h_out_data = h_out_data + gate_offset; \
c_out_data = c_out_data + gate_offset
for (int i = 0; i < N; ++i) {
int bid = is_reverse ? N - 1 - i : i; #define PROCESS_H0C0_DEFINES \
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; int bid = is_reverse ? N - 1 - i : i; \
const T* prev_c_data = nullptr; int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \
const T* prev_h_data = nullptr; const T* prev_c_data = nullptr; \
int tstart = 0; const T* prev_h_data = nullptr; \
if (h0_data) { int tstart = 0
prev_h_data = h0_data + bid * D;
prev_c_data = c0_data + bid * D; #define PROCESS_H0C0_PEEPHOLE \
} else { PROCESS_H0C0_DEFINES; \
// W_ch, W_ih, W_fh, W_oh if (h0_data) { \
act_gate(D3, xx_data + D, xx_data + D); prev_h_data = h0_data + bid * D; \
act_cand(D, xx_data, xx_data); prev_c_data = c0_data + bid * D; \
// cell out= input*tilde } else { \
blas.VMUL(D, xx_data, xx_data + D, cell_out_data); COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
// hidden out= act_state(cellout) * outgate MOVE_ONE_STEP; \
act_cell(D, cell_out_data, xx_data + D2); tstart = 1; \
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); }
// prev
prev_h_data = hidden_out_data;
prev_c_data = cell_out_data;
tstart = 1;
move_step();
}
for (int step = tstart; step < seq_len; ++step) {
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
prev_h_data, D, wh_data, D4, static_cast<T>(1), xx_data, D4);
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data);
// a = forget * prev_cell
blas.VMUL(D, xx_data + D2, prev_c_data, xx_data + D2);
// b = input * tilde
blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
// cell out= a+b
blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);
// hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2);
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev #define PROCESS_H0C0 \
prev_h_data = hidden_out_data; PROCESS_H0C0_DEFINES; \
prev_c_data = cell_out_data; if (h0_data) { \
prev_h_data = h0_data + bid * D; \
prev_c_data = c0_data + bid * D; \
} else { \
COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
MOVE_ONE_STEP; \
tstart = 1; \
}
move_step(); if (use_peepholes) {
for (int i = 0; i < N; ++i) {
PROCESS_H0C0_PEEPHOLE
for (int step = tstart; step < seq_len; ++step) {
GEMM_WH_ADDON(1, prev_h_data, xx_data);
COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data);
MOVE_ONE_STEP;
}
}
} else {
for (int i = 0; i < N; ++i) {
PROCESS_H0C0
for (int step = tstart; step < seq_len; ++step) {
GEMM_WH_ADDON(1, prev_h_data, xx_data);
COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data);
MOVE_ONE_STEP;
}
      }
    }
#undef PROCESS_H0C0_DEFINES
#undef PROCESS_H0C0_PEEPHOLE
#undef PROCESS_H0C0
#undef MOVE_ONE_STEP
  }
  void BatchCompute(const framework::ExecutionContext& ctx) const {
@@ -357,17 +422,13 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
    }
    INIT_BASE_SIZES
    INIT_VEC_FUNC
INIT_BASE_INPUT_DATAS
auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0"); auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0"); auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
auto* batched_input = ctx.Output<LoDTensor>("BatchedInput"); auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell"); auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell");
auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden"); auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden");
const T* x_data = x->data<T>();
const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>();
auto place = ctx.GetPlace();
T* xx_data = xx->mutable_data<T>(place); T* xx_data = xx->mutable_data<T>(place);
T* batched_input_data = batched_input->mutable_data<T>(place); T* batched_input_data = batched_input->mutable_data<T>(place);
T* batched_c_out_data = batched_c_out->mutable_data<T>(place); T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
@@ -419,17 +480,14 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
      T* cur_in_data = batched_input_data;
      T* cur_h_out_data = batched_h_out_data;
      T* cur_c_out_data = batched_c_out_data;
-     // W_ch, W_ih, W_fh, W_oh
      for (int i = 0; i < max_bs; ++i) {
-       act_gate(D3, cur_in_data + D, cur_in_data + D);
-       act_cand(D, cur_in_data, cur_in_data);
-       // cell out= input*tilde
-       blas.VMUL(D, cur_in_data, cur_in_data + D, cur_c_out_data);
-       // hidden out= act_state(cellout) * outgate
-       act_cell(D, cur_c_out_data, cur_in_data + D2);
-       blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
-       // add offset
        GET_Ct_NOH0C0(cur_in_data, cur_c_out_data);
        if (use_peepholes) {
          blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D);
          blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3);
        }
        act_gate(D, cur_in_data + D3, cur_in_data + D3);
        GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data);
        cur_in_data += D4;
        cur_c_out_data += D;
        cur_h_out_data += D;
@@ -438,49 +496,60 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
      prev_h_data = batched_h_out_data;
      prev_c_data = batched_c_out_data;
    }
-   // Then start from next
    const auto& batch_starts = batched_lod[0];
    const int max_seq_len = batch_starts.size() - 1;
    const int offset = tstart * max_bs * D;
    batched_input_data = batched_input_data + offset * 4;
    batched_h_out_data = batched_h_out_data + offset;
    batched_c_out_data = batched_c_out_data + offset;
-   for (int step = tstart; step < max_seq_len; ++step) {
-     const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-     blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D4, D, static_cast<T>(1),
-               prev_h_data, D, wh_data, D4, static_cast<T>(1),
-               batched_input_data, D4);
-     T* cur_in_data = batched_input_data;
-     T* cur_prev_c_data = prev_c_data;
-     T* cur_c_out_data = batched_c_out_data;
-     T* cur_h_out_data = batched_h_out_data;
-     for (int i = 0; i < cur_bs; ++i) {
-       // W_ch, W_ih, W_fh, W_oh
-       act_gate(D3, cur_in_data + D, cur_in_data + D);
-       act_cand(D, cur_in_data, cur_in_data);
-       // a = forget * prev_cell
-       blas.VMUL(D, cur_in_data + D2, cur_prev_c_data, cur_in_data + D2);
-       // b = input * tilde
-       blas.VMUL(D, cur_in_data, cur_in_data + D, cur_in_data + D);
-       // cell out= a+b
-       blas.VADD(D, cur_in_data + D, cur_in_data + D2, cur_c_out_data);
-       // hidden out= act_state(cellout) * outgate
-       act_cell(D, cur_c_out_data, cur_in_data + D2);
-       blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
-       cur_in_data += D4;
-       cur_prev_c_data += D;
-       cur_c_out_data += D;
-       cur_h_out_data += D;
-     }
-     prev_c_data = batched_c_out_data;
-     prev_h_data = batched_h_out_data;
-     batched_c_out_data = cur_c_out_data;
-     batched_h_out_data = cur_h_out_data;
-     batched_input_data = cur_in_data;
-   }

#define DEFINE_CUR                        \
  T* cur_in_data = batched_input_data;    \
  T* cur_prev_c_data = prev_c_data;       \
  T* cur_c_out_data = batched_c_out_data; \
  T* cur_h_out_data = batched_h_out_data

#define MOVE_ONE_BATCH  \
  cur_in_data += D4;    \
  cur_prev_c_data += D; \
  cur_c_out_data += D;  \
  cur_h_out_data += D

#define MOVE_ONE_STEP                  \
  prev_c_data = batched_c_out_data;    \
  prev_h_data = batched_h_out_data;    \
  batched_c_out_data = cur_c_out_data; \
  batched_h_out_data = cur_h_out_data; \
  batched_input_data = cur_in_data

    if (use_peepholes) {
      for (int step = tstart; step < max_seq_len; ++step) {
        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
        DEFINE_CUR;
        for (int i = 0; i < cur_bs; ++i) {
          COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data,
                                cur_h_out_data);
          MOVE_ONE_BATCH;
        }
        MOVE_ONE_STEP;
      }
    } else {
      for (int step = tstart; step < max_seq_len; ++step) {
        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
        DEFINE_CUR;
        for (int i = 0; i < cur_bs; ++i) {
          COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
                       cur_h_out_data);
          MOVE_ONE_BATCH;
        }
        MOVE_ONE_STEP;
      }
    }
#undef MOVE_ONE_STEP
#undef MOVE_ONE_BATCH
#undef DEFINE_CUR
    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
    batched_h_out->set_lod(batched_lod);
@@ -496,6 +565,16 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
      BatchCompute(ctx);
    }
  }
#undef COMPUTE_CtHt_PEEPHOLE
#undef COMPUTE_CtHt
#undef GET_Ct_NOH0C0
#undef COMPUTE_CtHt_NOH0C0
#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
#undef GET_Ht
#undef GET_Ct
#undef GEMM_WH_ADDON
#undef INIT_BASE_INPUT_DATAS
#undef INIT_BASE_SIZES
#undef INIT_BASE_INPUT_OUTPUT
#undef INIT_VEC_FUNC
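BatchCompute above assumes the input has been reordered into time-step-major
batches (Batch2LoDTensorFunctor converts back to sequence order at the end), so
step t issues one GEMM over the cur_bs sequences that still have a token at t
instead of N tiny GEMMs. A sketch of how the batch_starts offsets fall out of
the sequence lengths, under that assumption (function name illustrative):

def batch_starts_from_lens(lens):
    lens = sorted(lens, reverse=True)
    starts = [0]
    for t in range(lens[0]):
        alive = sum(1 for l in lens if l > t)  # cur_bs at time step t
        starts.append(starts[-1] + alive)
    return starts

print(batch_starts_from_lens([3, 1, 2]))  # [0, 3, 5, 6] -> cur_bs = 3, 2, 1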
......
@@ -67,27 +67,27 @@ template <typename T, int BlockDim>
__global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
                                 T *y, T *mean, T *var, float epsilon,
                                 int feature_size) {
- using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
  using BlockReduce = cub::BlockReduce<PairForLayerNorm<double>, BlockDim>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  int beg_idx = blockIdx.x * feature_size + threadIdx.x;
  int end_idx = (blockIdx.x + 1) * feature_size;
  // Step 1: Reduce to calculate mean and var
- T mean_val = static_cast<T>(0);
- T var_val = static_cast<T>(0);
  double mean_val = 0;
  double var_val = 0;
  for (int i = beg_idx; i < end_idx; i += BlockDim) {
    T tmp = x[i];
    mean_val += tmp;
    var_val += (tmp * tmp);
  }
  auto pair = BlockReduce(temp_storage)
-                 .Reduce(PairForLayerNorm<T>(mean_val, var_val),
-                         PairForLayerNormAddFunctor<T>());
                  .Reduce(PairForLayerNorm<double>(mean_val, var_val),
                          PairForLayerNormAddFunctor<double>());
  if (threadIdx.x == 0) {
    auto tmp = pair.first_ / feature_size;
-   mean[blockIdx.x] = tmp;
-   var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp;
    mean[blockIdx.x] = static_cast<T>(tmp);
    var[blockIdx.x] = static_cast<T>(pair.second_ / feature_size - tmp * tmp);
  }
  __syncthreads();
  mean_val = mean[blockIdx.x];
...
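The switch from T to double in the layer_norm reduction above matters because
the kernel computes the variance as E[x^2] - E[x]^2, which cancels
catastrophically in float32 when the mean is large relative to the spread. A
small NumPy illustration of the effect (the values are illustrative):

import numpy as np

x = (1000.0 + 0.01 * np.random.randn(4096)).astype(np.float32)

def moments(x, dtype):
    xd = x.astype(dtype)
    m = xd.mean()
    return m, (xd * xd).mean() - m * m  # same E[x^2] - E[x]^2 form as the kernel

print(moments(x, np.float32)[1])  # often negative or far from 1e-4
print(moments(x, np.float64)[1])  # close to the true variance of ~1e-4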
@@ -57,7 +57,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
        memset(output + i * row_width, 0, row_width * sizeof(T));
      } else {
        PADDLE_ENFORCE_LT(ids[i], row_number);
-       PADDLE_ENFORCE_GE(ids[i], 0);
        PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
        memcpy(output + i * row_width, table + ids[i] * row_width,
               row_width * sizeof(T));
      }
...
@@ -246,6 +246,88 @@ class ReshapeGradKernel {
  }
};
// FIXME(zcd): reshape2 adds an intermediate output (XShape) based on reshape.
// XShape carries the shape and LoD of X, which reshape2_grad later consumes;
// this lets the framework reuse the memory of X as soon as the reshape2 op
// finishes. For compatibility reasons we cannot change reshape_op itself.
class Reshape2Op : public ReshapeOp {
public:
Reshape2Op(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: ReshapeOp(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
ReshapeOp::InferShape(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of ReshapeOp should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Reshape2OpMaker : public ReshapeOpMaker {
public:
void Make() override {
ReshapeOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in FlattenGradOp.")
.AsIntermediate();
}
};
class Reshape2GradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("reshape2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Reshape2GradOp : public framework::OperatorWithKernel {
public:
Reshape2GradOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = ctx->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("XShape", framework::GradVarName("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
->type()),
ctx.device_context());
}
};
}  // namespace operators
}  // namespace paddle
namespace ops = paddle::operators;
@@ -261,6 +343,17 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                               ops::ReshapeGradKernel, int64_t,
                               ops::ReshapeGradKernel);
REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker,
ops::Reshape2GradMaker);
REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp);
REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
int64_t, ops::ReshapeKernel);
REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel);
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
                                ops::ReshapeKernel, int, ops::ReshapeKernel,
@@ -269,4 +362,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                                double, ops::ReshapeGradKernel, int,
                                ops::ReshapeGradKernel, int64_t,
                                ops::ReshapeGradKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
int64_t, ops::ReshapeKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel);
#endif
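The XShape trick used by reshape2 (and by squeeze2, transpose2, and unsqueeze2
below) can be summarized in a few lines: the forward op emits a zero-headed
copy of X's dims, and the grad op recovers dX's shape from it without keeping
X alive. A NumPy sketch under those assumptions (the function names are
illustrative, not the framework's API):

import numpy as np

def reshape2_forward(x, shape):
    xshape_dims = (0,) + x.shape  # only the dims matter, never the data
    return x.reshape(shape), xshape_dims

def reshape2_grad(dout, xshape_dims):
    return dout.reshape(xshape_dims[1:])  # slice off the leading 0, as above

x = np.arange(6).reshape(2, 3)
out, xshape = reshape2_forward(x, (3, 2))
dx = reshape2_grad(np.ones_like(out), xshape)
assert dx.shape == x.shape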
@@ -36,9 +36,13 @@ class RmspropOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(param_out) of RmspropOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
-                  "Output(Momentum_out) of RmspropOp should not be null.");
                   "Output(MomentOut) of RmspropOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
                   "Output(MeanSquareOut) of RmspropOp should not be null.");
if (ctx->Attrs().Get<bool>("centered")) {
PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"),
"Output(MeanGradOut) of RmspropOp should not be null.");
}
auto param_dim = ctx->GetInputDim("Param"); auto param_dim = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -58,6 +62,9 @@ class RmspropOp : public framework::OperatorWithKernel { ...@@ -58,6 +62,9 @@ class RmspropOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("MomentOut", param_dim); ctx->SetOutputDim("MomentOut", param_dim);
ctx->SetOutputDim("MeanSquareOut", param_dim); ctx->SetOutputDim("MeanSquareOut", param_dim);
if (ctx->Attrs().Get<bool>("centered")) {
ctx->SetOutputDim("MeanGradOut", param_dim);
}
  }
};
@@ -70,6 +77,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("MeanSquare",
             "(Tensor, default Tensor<float>)"
             " The mean square value that gets updated.");
AddInput("MeanGrad",
"(Tensor, default Tensor<float>)"
" The moving average of gradient")
.AsDispensable();
AddInput("LearningRate", AddInput("LearningRate",
"(Tensor, default Tensor<float>) " "(Tensor, default Tensor<float>) "
"The learning rate should be a tensor of size 1."); "The learning rate should be a tensor of size 1.");
...@@ -82,6 +93,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -82,6 +93,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("ParamOut", "(Tensor) Output updated parameter value."); AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
AddOutput("MomentOut", "(Tensor) Output updated moment."); AddOutput("MomentOut", "(Tensor) Output updated moment.");
AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value."); AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
AddOutput("MeanGradOut",
"(Tensor) Output moving average of gradient updated value.");
AddAttr<float>("epsilon", AddAttr<float>("epsilon",
"(float, default 1e-10) Constant " "(float, default 1e-10) Constant "
...@@ -93,6 +106,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -93,6 +106,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(0.9f); .SetDefault(0.9f);
AddAttr<float>("momentum", "(float, default 0.0) Constant value.") AddAttr<float>("momentum", "(float, default 0.0) Constant value.")
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<bool>("centered", "(bool, default false) use centered rmsprop.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Rmsprop Optimizer. Rmsprop Optimizer.
...@@ -103,6 +118,14 @@ MomentOut = momentum * Moment + ...@@ -103,6 +118,14 @@ MomentOut = momentum * Moment +
ParamOut = Param - MomentOut ParamOut = Param - MomentOut
$$ $$
if centered is true:
mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
mom = momentum * mom{t-1} + learning_rate * g_t /
sqrt(mean_square - mean_grad**2 + epsilon)
param -= mom
The original slides that proposed Rmsprop: Slide 29 of
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
...
@@ -41,6 +41,7 @@ class RmspropOpKernel : public framework::OpKernel<T> {
    float epsilon = ctx.Attr<float>("epsilon");
    float rho = ctx.Attr<float>("decay");
    float momentum = ctx.Attr<float>("momentum");
bool centered = ctx.Attr<bool>("centered");
auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param")); auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare")); auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
...@@ -53,12 +54,24 @@ class RmspropOpKernel : public framework::OpKernel<T> { ...@@ -53,12 +54,24 @@ class RmspropOpKernel : public framework::OpKernel<T> {
auto ms_out = EigenVector<T>::Flatten(*mean_square_out); auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 1> grad_dsize(grad->numel()); Eigen::DSizes<int, 1> grad_dsize(static_cast<int>(grad->numel()));
ms_out.device(place) = rho * ms + (1 - rho) * g * g; ms_out.device(place) = rho * ms + (1 - rho) * g * g;
mom_out.device(place) = if (centered) {
momentum * mom + auto mg = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanGrad"));
lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); auto* mean_grad_out = ctx.Output<Tensor>("MeanGradOut");
mean_grad_out->mutable_data<T>(ctx.GetPlace());
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) = momentum * mom +
lr.broadcast(grad_dsize) * g /
(ms_out - mg_out.square() + epsilon).sqrt();
} else {
mom_out.device(place) =
momentum * mom +
lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
}
    p_out.device(place) = p - mom_out;
  }
};
...
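Putting the DOC equations and the kernel branch above together, one centered
RMSProp step looks like this in NumPy (a sketch of the update rule, not of the
Eigen kernel):

import numpy as np

def rmsprop_step(param, grad, ms, mg, mom, lr,
                 decay=0.95, momentum=0.0, epsilon=1e-10, centered=False):
    ms = decay * ms + (1 - decay) * grad * grad
    if centered:
        mg = decay * mg + (1 - decay) * grad
        denom = np.sqrt(ms - mg ** 2 + epsilon)  # variance-like estimate
    else:
        denom = np.sqrt(ms + epsilon)            # plain RMSProp
    mom = momentum * mom + lr * grad / denom
    return param - mom, ms, mg, mom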
@@ -126,15 +126,15 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault({});
    AddComment(R"DOC(
Squeeze Operator.

Remove single-dimensional entries from the shape of a tensor.
Takes a parameter axes with a list of axes to squeeze.
If axes is not provided, all the single dimensions will be removed from the shape.
If an axis is selected with shape entry not equal to one, an error is raised.

Examples:
Case 1:
  Given
    X.shape = (1, 3, 1, 5)
  and
    axes = [0]
@@ -144,7 +144,7 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
Case 2:
  Given
    X.shape = (1, 3, 1, 5)
  and
    axes = []
  we get:
    Out.shape = (3, 5)
@@ -181,6 +181,113 @@ class SqueezeGradOp : public framework::OperatorBase {
  }
};
// FIXME(zcd): squeeze2 adds an intermediate output (XShape) based on squeeze.
// XShape carries the shape and LoD of X, which squeeze2_grad later consumes;
// this lets the framework reuse the memory of X as soon as the squeeze2 op
// finishes. For compatibility reasons we cannot change squeeze_op itself.
class Squeeze2OpMaker : public SqueezeOpMaker {
public:
void Make() override {
SqueezeOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in SqueezeGradOp.")
.AsIntermediate();
}
};
class Squeeze2OpInferShape : public SqueezeOpInferShape {
public:
void operator()(framework::InferShapeContext *ctx) const override {
SqueezeOpInferShape::operator()(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of Squeeze operator should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Squeeze2Op : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axes = Attr<std::vector<int>>("axes");
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims);
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims);
// Invoke Reshape Op
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class Squeeze2GradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("squeeze2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Squeeze2GradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("XShape"),
"Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
};
class Squeeze2GradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto xshape_name = Input("XShape");
auto xshape_dims =
scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {dout_name}}, {"Shape", {}}},
{{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
reshape_op->Run(scope, place);
}
};
}  // namespace operators
}  // namespace paddle
@@ -192,3 +299,8 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
                  ops::SqueezeOpInferShape,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape);
REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
ops::Squeeze2OpInferShape, ops::Squeeze2GradOpMaker);
REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
ops::Squeeze2GradInferShape);
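Note that Squeeze2Op and Squeeze2GradOp carry no kernels of their own: RunImpl
just computes the target dims and delegates to reshape2. The shape computation
reduces to the following NumPy sketch (the helper name is illustrative):

import numpy as np

def squeeze_shape(shape, axes):
    if not axes:  # no axes given: drop every size-1 dimension
        return [d for d in shape if d != 1]
    axes = [a % len(shape) for a in axes]
    assert all(shape[a] == 1 for a in axes), "squeezed axes must have size 1"
    return [d for i, d in enumerate(shape) if i not in axes]

x = np.zeros((1, 3, 1, 5))
out = x.reshape(squeeze_shape(x.shape, [0]))  # Case 1 above: (3, 1, 5)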
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/transpose_op.h"
#include <string>
#include <vector>
namespace paddle {
@@ -24,7 +25,7 @@ class TransposeOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
- void InferShape(framework::InferShapeContext* ctx) const override {
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
    auto x_dims = ctx->GetInputDim("X");
@@ -90,7 +91,7 @@ The behavior of this operator is similar to how `numpy.transpose` works.
    2 &5
    \end{pmatrix}$$
- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is - Given a input tensor with shape $(N, C, H, W)$ and the `axes` is
$[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$.
)DOC");
@@ -101,7 +102,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
- void InferShape(framework::InferShapeContext* ctx) const override {
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");
@@ -113,6 +114,93 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
  }
};
// FIXME(zcd): transpose2 adds an intermediate output (XShape) based on
// transpose. XShape carries the shape and LoD of X, which transpose2_grad
// later consumes; this lets the framework reuse the memory of X as soon as
// the transpose2 op finishes. For compatibility reasons we cannot change
// transpose_op itself.
class Transpose2Op : public TransposeOp {
public:
Transpose2Op(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: TransposeOp(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
TransposeOp::InferShape(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) should not be null");
const auto &in_dims = ctx->GetInputDim("X");
std::vector<int64_t> x_shape_dim(in_dims.size() + 1);
x_shape_dim[0] = 0;
for (int i = 0; i < in_dims.size(); ++i) {
x_shape_dim[i + 1] = in_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(x_shape_dim));
ctx->ShareLoD("X", /*->*/ "XShape");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
};
class Transpose2OpMaker : public TransposeOpMaker {
public:
void Make() override {
TransposeOpMaker::Make();
AddOutput("XShape", "(Tensor)The output tensor.").AsIntermediate();
}
};
class Transpose2GradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("transpose2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Transpose2OpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
if (ctx->HasOutput(framework::GradVarName("X"))) {
auto xshape_dim = ctx->GetInputDim("XShape");
auto x_shape_dim =
framework::slice_ddim(xshape_dim, 1, xshape_dim.size());
ctx->SetOutputDim(framework::GradVarName("X"), x_shape_dim);
ctx->ShareLoD("XShape", framework::GradVarName("X"));
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
->type()),
ctx.device_context());
}
};
}  // namespace operators
}  // namespace paddle
@@ -120,8 +208,20 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
    transpose_grad,
    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
ops::Transpose2GradMaker);
REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad);
REGISTER_OP_CPU_KERNEL(
transpose2,
ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
transpose2_grad,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
@@ -21,3 +21,10 @@ REGISTER_OP_CUDA_KERNEL(
REGISTER_OP_CUDA_KERNEL(
    transpose_grad,
    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
transpose2,
ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
transpose2_grad,
ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
@@ -127,13 +127,13 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
    });
    AddComment(R"DOC(
Unsqueeze Operator.

Insert single-dimensional entries to the shape of a tensor.
Takes one required argument axes, a list of dimensions that will be inserted.
Dimension indices in axes are as seen in the output tensor.

For example:
  Given a tensor such that tensor with shape [3, 4, 5],
  then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1]
)DOC");
  }
@@ -168,6 +168,112 @@ class UnsqueezeGradOp : public framework::OperatorBase {
  }
};
// FIXME(zcd): unsqueeze2 adds an intermediate output (XShape) based on
// unsqueeze. XShape carries the shape and LoD of X, which unsqueeze2_grad
// later consumes; this lets the framework reuse the memory of X as soon as
// the unsqueeze2 op finishes. For compatibility reasons we cannot change
// unsqueeze_op itself.
class Unsqueeze2OpInferShape : public UnsqueezeOpInferShape {
public:
void operator()(framework::InferShapeContext *ctx) const override {
UnsqueezeOpInferShape::operator()(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of Unsqueeze operator should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
public:
void Make() override {
UnsqueezeOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in UnsqueezeGradOp.")
.AsIntermediate();
}
};
class Unsqueeze2Op : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axes = Attr<std::vector<int>>("axes");
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
auto out_dims = Unsqueeze2OpInferShape::GetOutputShape(axes, x_dims);
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims);
// Invoke Reshape op.
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class Unsqueeze2GradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("unsqueeze2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Unsqueeze2GradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("XShape"),
"Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
};
class Unsqueeze2GradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto xshape_name = Input("XShape");
auto xshape_dims =
scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {dout_name}}, {"Shape", {}}},
{{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
reshape_op->Run(scope, place);
}
};
}  // namespace operators
}  // namespace paddle
@@ -180,3 +286,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
                  ops::UnsqueezeGradInferShape);
REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
ops::Unsqueeze2OpInferShape, ops::Unsqueeze2GradOpMaker);
REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
ops::Unsqueeze2GradInferShape);
@@ -121,6 +121,12 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
  if (nullptr == dso_handle) {
    LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
                 << dlerror() << ")";
if (dlPath.find("nccl") != std::string::npos) {
std::cout
<< "You may need to install 'nccl2' from NVIDIA official website: "
<< "https://developer.nvidia.com/nccl/nccl-download"
<< "before install PaddlePaddle" << std::endl;
}
    dlPath = dso_name;
    dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
  }
...
@@ -100,14 +100,13 @@ struct NCCLContextMap {
      return;
    }
    std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
-   // if pass nccl_id here, can assume we are doing multi node training
-   if (nccl_id == nullptr) {
    // if num_trainers == 1, should create a new nccl id for local comms.
    if (num_trainers == 1) {
      std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
          comms.get(), static_cast<int>(order_.size()), order_.data()));
    } else {
-     PADDLE_ENFORCE_GT(num_trainers, 1);
-     // TODO(wuyi): need to ensure each node have same number of GPUs
      PADDLE_ENFORCE_NOT_NULL(nccl_id);
      {
        int nranks = num_trainers * order_.size();
        NCCLGroupGuard gurad;
...
@@ -115,6 +115,7 @@ function cmake_gen() {
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_INFERENCE=${WITH_INFERENCE:-ON}
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
        -DPY_VERSION=${PY_VERSION:-2.7}
    ========================================
@@ -144,6 +145,7 @@ EOF
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
        -DPY_VERSION=${PY_VERSION:-2.7}
}
@@ -498,7 +500,7 @@ EOF
EOF
    if [[ ${WITH_GPU} == "ON" ]]; then
-       NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.1.2-1+cuda${CUDA_MAJOR} libnccl-dev=2.1.2-1+cuda${CUDA_MAJOR} &&"
        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&"
    else
        NCCL_DEPS=""
    fi
@@ -545,14 +547,14 @@ function gen_capi_package() {
        rm -rf $install_prefix
        make DESTDIR="$install_prefix" install
        cd $install_prefix/usr/local
-       ls | egrep -v "^Found.*item$" | xargs tar -cf ${PADDLE_ROOT}/build/paddle.tgz
        ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz
    fi
}

function gen_fluid_inference_lib() {
    mkdir -p ${PADDLE_ROOT}/build
    cd ${PADDLE_ROOT}/build
-   if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
      cat <<EOF
    ========================================
    Deploying fluid inference library ...
@@ -567,7 +569,7 @@ EOF
}

function test_fluid_inference_lib() {
-   if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
      cat <<EOF
    ========================================
    Testing fluid inference library ...
...
@@ -104,7 +104,7 @@ def batch_images_from_tar(data_file,
            pickle.dump(
                output,
                open('%s/batch_%d' % (out_path, file_id), 'wb'),
-               protocol=pickle.HIGHEST_PROTOCOL)
                protocol=2)
            file_id += 1
            data = []
            labels = []
@@ -113,9 +113,7 @@ def batch_images_from_tar(data_file,
    output['label'] = labels
    output['data'] = data
-   pickle.dump(
-       output,
-       open('%s/batch_%d' % (out_path, file_id), 'wb'),
-       protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(
        output, open('%s/batch_%d' % (out_path, file_id), 'wb'), protocol=2)
    with open(meta_file, 'a') as meta:
        for file in os.listdir(out_path):
...
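The move from pickle.HIGHEST_PROTOCOL to protocol=2 above keeps the batch
files portable: under Python 3, HIGHEST_PROTOCOL is 3 or higher, which
Python 2 cannot read, while protocol 2 is understood by both. For example:

import pickle

output = {'label': [0, 1], 'data': [[0.1, 0.2], [0.3, 0.4]]}
with open('batch_0', 'wb') as f:
    pickle.dump(output, f, protocol=2)  # readable from Python 2 and 3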
...@@ -78,7 +78,7 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -78,7 +78,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
return acc_out return acc_out
def auc(input, label, curve='ROC', num_thresholds=200, topk=1): def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
""" """
**Area Under the Curve (AUC) Layer** **Area Under the Curve (AUC) Layer**
...@@ -118,16 +118,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): ...@@ -118,16 +118,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
""" """
helper = LayerHelper("auc", **locals()) helper = LayerHelper("auc", **locals())
auc_out = helper.create_tmp_variable(dtype="float64") auc_out = helper.create_tmp_variable(dtype="float64")
batch_auc_out = helper.create_tmp_variable(dtype="float64")
# make tp, tn, fp, fn persistable, so that they can accumulate over all batches. # make tp, tn, fp, fn persistable, so that they can accumulate over all batches.
tp = helper.create_global_variable( stat_pos = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds]) persistable=True, dtype='int64', shape=[num_thresholds + 1])
tn = helper.create_global_variable( stat_neg = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds]) persistable=True, dtype='int64', shape=[num_thresholds + 1])
fp = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds]) for var in [stat_pos, stat_neg]:
fn = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds])
for var in [tp, tn, fp, fn]:
helper.set_variable_initializer( helper.set_variable_initializer(
var, Constant( var, Constant(
value=0.0, force_cpu=True)) value=0.0, force_cpu=True))
...@@ -137,18 +135,15 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): ...@@ -137,18 +135,15 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
inputs={ inputs={
"Predict": [input], "Predict": [input],
"Label": [label], "Label": [label],
"TP": [tp], "StatPos": [stat_pos],
"TN": [tn], "StatNeg": [stat_neg]
"FP": [fp],
"FN": [fn]
}, },
attrs={"curve": curve, attrs={"curve": curve,
"num_thresholds": num_thresholds}, "num_thresholds": num_thresholds},
outputs={ outputs={
"AUC": [auc_out], "AUC": [auc_out],
"TPOut": [tp], "BatchAUC": [batch_auc_out],
"TNOut": [tn], "StatPosOut": [stat_pos],
"FPOut": [fp], "StatNegOut": [stat_neg]
"FNOut": [fn]
}) })
return auc_out, [tp, tn, fp, fn] return auc_out, batch_auc_out, [stat_pos, stat_neg]
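The auc layer now returns the accumulated AUC, the current batch's AUC, and the two persistable histogram variables instead of the old [tp, tn, fp, fn] list. A usage sketch under the new signature (network and shapes are illustrative assumptions, not from this diff):

import paddle.fluid as fluid

data = fluid.layers.data(name='data', shape=[32], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
predict = fluid.layers.fc(input=data, size=2, act='softmax')

# New return signature: global AUC, per-batch AUC, and the stat variables
# that persist across batches.
auc_out, batch_auc_out, [stat_pos, stat_neg] = fluid.layers.auc(
    input=predict, label=label, num_thresholds=2**12 - 1)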
...@@ -3546,11 +3546,6 @@ def topk(input, k, name=None): ...@@ -3546,11 +3546,6 @@ def topk(input, k, name=None):
top5_values, top5_indices = layers.topk(input, k=5) top5_values, top5_indices = layers.topk(input, k=5)
""" """
shape = input.shape
if k < 1 or k >= shape[-1]:
raise ValueError("k must be greater than 0 and less than %d." %
(shape[-1]))
helper = LayerHelper("top_k", **locals()) helper = LayerHelper("top_k", **locals())
values = helper.create_tmp_variable(dtype=input.dtype) values = helper.create_tmp_variable(dtype=input.dtype)
indices = helper.create_tmp_variable(dtype="int64") indices = helper.create_tmp_variable(dtype="int64")
...@@ -4030,10 +4025,12 @@ def transpose(x, perm, name=None): ...@@ -4030,10 +4025,12 @@ def transpose(x, perm, name=None):
helper = LayerHelper('transpose', **locals()) helper = LayerHelper('transpose', **locals())
out = helper.create_tmp_variable(x.dtype) out = helper.create_tmp_variable(x.dtype)
x_shape = helper.create_tmp_variable(x.dtype)
helper.append_op( helper.append_op(
type='transpose', type='transpose2',
inputs={'X': [x]}, inputs={'X': [x]},
outputs={'Out': [out]}, outputs={'Out': [out],
'XShape': [x_shape]},
attrs={'axis': perm}) attrs={'axis': perm})
return out return out
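transpose is the first of several layers in this change (reshape, squeeze, unsqueeze, flatten) whose underlying op is renamed to a *2 variant that also emits an XShape output recording the input's shape for the grad op; the Python-level API is unchanged. A caller-side sketch (shapes illustrative):

import paddle.fluid as fluid

x = fluid.layers.data(
    name='x', shape=[5, 10, 15], dtype='float32', append_batch_size=False)
# Same layer call as before; internally this now appends a transpose2 op
# with an extra XShape output consumed only by the backward pass.
x_t = fluid.layers.transpose(x, perm=[1, 0, 2])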
...@@ -4503,7 +4500,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4503,7 +4500,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
""" """
if not (isinstance(shape, list) or isinstance(shape, tuple)): if not (isinstance(shape, list) or isinstance(shape, tuple)):
raise ValueError("Input shape must be a python lsit or tuple.") raise ValueError("Input shape must be a python list or tuple.")
inputs = {"X": x} inputs = {"X": x}
if isinstance(actual_shape, Variable): if isinstance(actual_shape, Variable):
inputs["Shape"] = actual_shape inputs["Shape"] = actual_shape
...@@ -4525,13 +4522,15 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4525,13 +4522,15 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
"Each dimension size given in shape must not be negtive " "Each dimension size given in shape must not be negtive "
"except one unknown dimension.") "except one unknown dimension.")
helper = LayerHelper("reshape", **locals()) helper = LayerHelper("reshape2", **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_tmp_variable(dtype=x.dtype)
x_shape = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op( helper.append_op(
type="reshape", type="reshape2",
inputs=inputs, inputs=inputs,
attrs={"shape": shape}, attrs={"shape": shape},
outputs={"Out": out}) outputs={"Out": out,
"XShape": x_shape})
return helper.append_activation(out) return helper.append_activation(out)
...@@ -4575,11 +4574,13 @@ def squeeze(input, axes, name=None): ...@@ -4575,11 +4574,13 @@ def squeeze(input, axes, name=None):
""" """
helper = LayerHelper("squeeze", **locals()) helper = LayerHelper("squeeze", **locals())
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_tmp_variable(dtype=input.dtype)
x_shape = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op( helper.append_op(
type="squeeze", type="squeeze2",
inputs={"X": input}, inputs={"X": input},
attrs={"axes": axes}, attrs={"axes": axes},
outputs={"Out": out}) outputs={"Out": out,
"XShape": x_shape})
return out return out
...@@ -4610,11 +4611,13 @@ def unsqueeze(input, axes, name=None): ...@@ -4610,11 +4611,13 @@ def unsqueeze(input, axes, name=None):
""" """
helper = LayerHelper("unsqueeze", **locals()) helper = LayerHelper("unsqueeze", **locals())
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_tmp_variable(dtype=input.dtype)
x_shape = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op( helper.append_op(
type="unsqueeze", type="unsqueeze2",
inputs={"X": input}, inputs={"X": input},
attrs={"axes": axes}, attrs={"axes": axes},
outputs={"Out": out}) outputs={"Out": out,
"XShape": x_shape})
return out return out
...@@ -5816,10 +5819,12 @@ def flatten(x, axis=1, name=None): ...@@ -5816,10 +5819,12 @@ def flatten(x, axis=1, name=None):
raise ValueError("The axis should be a int, and in range [0, rank(x)]") raise ValueError("The axis should be a int, and in range [0, rank(x)]")
out = helper.create_tmp_variable(x.dtype) out = helper.create_tmp_variable(x.dtype)
x_shape = helper.create_tmp_variable(x.dtype)
helper.append_op( helper.append_op(
type='flatten', type='flatten2',
inputs={"X": x}, inputs={"X": x},
outputs={'Out': out}, outputs={'Out': out,
'XShape': x_shape},
attrs={"axis": axis}) attrs={"axis": axis})
return out return out
......
...@@ -558,8 +558,6 @@ class Auc(MetricBase): ...@@ -558,8 +558,6 @@ class Auc(MetricBase):
name: metric name name: metric name
curve: Specifies the name of the curve to be computed, 'ROC' [default] or curve: Specifies the name of the curve to be computed, 'ROC' [default] or
'PR' for the Precision-Recall-curve. 'PR' for the Precision-Recall-curve.
num_thresholds: The number of thresholds to use when discretizing the roc
curve.
"NOTE: only implement the ROC curve type via Python now." "NOTE: only implement the ROC curve type via Python now."
...@@ -574,15 +572,14 @@ class Auc(MetricBase): ...@@ -574,15 +572,14 @@ class Auc(MetricBase):
numpy_auc = metric.eval() numpy_auc = metric.eval()
""" """
def __init__(self, name, curve='ROC', num_thresholds=200): def __init__(self, name, curve='ROC', num_thresholds=4095):
super(Auc, self).__init__(name=name) super(Auc, self).__init__(name=name)
self._curve = curve self._curve = curve
self._num_thresholds = num_thresholds self._num_thresholds = num_thresholds
self._epsilon = 1e-6
self.tp_list = np.zeros((num_thresholds, )) _num_pred_buckets = num_thresholds + 1
self.fn_list = np.zeros((num_thresholds, )) self._stat_pos = [0] * _num_pred_buckets
self.tn_list = np.zeros((num_thresholds, )) self._stat_neg = [0] * _num_pred_buckets
self.fp_list = np.zeros((num_thresholds, ))
def update(self, preds, labels): def update(self, preds, labels):
if not _is_numpy_(labels): if not _is_numpy_(labels):
...@@ -590,41 +587,32 @@ class Auc(MetricBase): ...@@ -590,41 +587,32 @@ class Auc(MetricBase):
if not _is_numpy_(preds): if not _is_numpy_(preds):
raise ValueError("The 'predictions' must be a numpy ndarray.") raise ValueError("The 'predictions' must be a numpy ndarray.")
kepsilon = 1e-7 # to account for floating point imprecisions for i, lbl in enumerate(labels):
thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1) value = preds[i, 1]
for i in range(self._num_thresholds - 2)] bin_idx = int(value * self._num_thresholds)
thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] assert bin_idx <= self._num_thresholds
if lbl:
# calculate TP, FN, TN, FP count self._stat_pos[bin_idx] += 1.0
for idx_thresh, thresh in enumerate(thresholds): else:
tp, fn, tn, fp = 0, 0, 0, 0 self._stat_neg[bin_idx] += 1.0
for i, lbl in enumerate(labels):
if lbl: @staticmethod
if preds[i, 1] >= thresh: def trapezoid_area(x1, x2, y1, y2):
tp += 1 return abs(x1 - x2) * (y1 + y2) / 2.0
else:
fn += 1
else:
if preds[i, 1] >= thresh:
fp += 1
else:
tn += 1
self.tp_list[idx_thresh] += tp
self.fn_list[idx_thresh] += fn
self.tn_list[idx_thresh] += tn
self.fp_list[idx_thresh] += fp
def eval(self): def eval(self):
epsilon = self._epsilon tot_pos = 0.0
num_thresholds = self._num_thresholds tot_neg = 0.0
tpr = (self.tp_list.astype("float32") + epsilon) / ( auc = 0.0
self.tp_list + self.fn_list + epsilon)
fpr = self.fp_list.astype("float32") / ( idx = self._num_thresholds
self.fp_list + self.tn_list + epsilon) while idx >= 0:
rec = (self.tp_list.astype("float32") + epsilon) / ( tot_pos_prev = tot_pos
self.tp_list + self.fp_list + epsilon) tot_neg_prev = tot_neg
tot_pos += self._stat_pos[idx]
x = fpr[:num_thresholds - 1] - fpr[1:] tot_neg += self._stat_neg[idx]
y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos,
auc_value = np.sum(x * y) tot_pos_prev)
return auc_value idx -= 1
return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0
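The rewritten metric replaces per-threshold TP/FP counting (O(num_thresholds x batch) per update) with a histogram: each score falls into one of num_thresholds + 1 buckets, and eval makes a single sweep over the buckets, summing trapezoid areas under the ROC curve. A self-contained NumPy sketch of the same computation (standalone illustration, not Paddle code):

import numpy as np

def bucketed_auc(preds, labels, num_thresholds=4095):
    # preds[i] is P(label == 1), matching the diff's use of preds[i, 1].
    stat_pos = np.zeros(num_thresholds + 1)
    stat_neg = np.zeros(num_thresholds + 1)
    for p, lbl in zip(preds, labels):
        bin_idx = int(p * num_thresholds)
        if lbl:
            stat_pos[bin_idx] += 1
        else:
            stat_neg[bin_idx] += 1
    # Sweep from the highest bucket down, accumulating TP/FP counts and
    # summing trapezoids (x axis: negatives seen, y axis: positives seen).
    tot_pos = tot_neg = auc = 0.0
    for idx in range(num_thresholds, -1, -1):
        pos_prev, neg_prev = tot_pos, tot_neg
        tot_pos += stat_pos[idx]
        tot_neg += stat_neg[idx]
        auc += abs(tot_neg - neg_prev) * (tot_pos + pos_prev) / 2.0
    return auc / tot_pos / tot_neg if tot_pos and tot_neg else 0.0

print(bucketed_auc(np.random.rand(128), np.random.randint(0, 2, 128)))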
...@@ -897,7 +897,20 @@ class RMSPropOptimizer(Optimizer): ...@@ -897,7 +897,20 @@ class RMSPropOptimizer(Optimizer):
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
\\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t)
if centered is True:
.. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
\\epsilon}} \\nabla Q_{i}(w) \\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t) w & = w - v(w, t)
...@@ -915,6 +928,10 @@ class RMSPropOptimizer(Optimizer): ...@@ -915,6 +928,10 @@ class RMSPropOptimizer(Optimizer):
avoid division by zero, set 1e-6 by default. avoid division by zero, set 1e-6 by default.
momentum(float): :math:`\\beta` in equation is the momentum term, momentum(float): :math:`\\beta` in equation is the momentum term,
set 0.0 by default. set 0.0 by default.
centered(bool): If True, gradients are normalized by the estimated variance of
the gradient; if False, by the uncentered second moment. Setting this to
True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False.
Raises: Raises:
ValueError: If learning_rate, rho, epsilon, momentum are None. ValueError: If learning_rate, rho, epsilon, momentum are None.
...@@ -928,12 +945,14 @@ class RMSPropOptimizer(Optimizer): ...@@ -928,12 +945,14 @@ class RMSPropOptimizer(Optimizer):
_momentum_acc_str = "momentum" _momentum_acc_str = "momentum"
_mean_square_acc_str = "mean_square" _mean_square_acc_str = "mean_square"
_mean_grad_acc_str = "mean_grad"
def __init__(self, def __init__(self,
learning_rate, learning_rate,
rho=0.95, rho=0.95,
epsilon=1.0e-6, epsilon=1.0e-6,
momentum=0.0, momentum=0.0,
centered=False,
**kwargs): **kwargs):
super(RMSPropOptimizer, self).__init__( super(RMSPropOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate, **kwargs)
...@@ -950,6 +969,7 @@ class RMSPropOptimizer(Optimizer): ...@@ -950,6 +969,7 @@ class RMSPropOptimizer(Optimizer):
self._rho = rho self._rho = rho
self._epsilon = epsilon self._epsilon = epsilon
self._momentum = momentum self._momentum = momentum
self._centered = centered
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block): if not isinstance(block, framework.Block):
...@@ -958,6 +978,7 @@ class RMSPropOptimizer(Optimizer): ...@@ -958,6 +978,7 @@ class RMSPropOptimizer(Optimizer):
for p in parameters: for p in parameters:
self._add_accumulator(self._momentum_acc_str, p) self._add_accumulator(self._momentum_acc_str, p)
self._add_accumulator(self._mean_square_acc_str, p) self._add_accumulator(self._mean_square_acc_str, p)
self._add_accumulator(self._mean_grad_acc_str, p)
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
if not isinstance(block, framework.Block): if not isinstance(block, framework.Block):
...@@ -967,6 +988,8 @@ class RMSPropOptimizer(Optimizer): ...@@ -967,6 +988,8 @@ class RMSPropOptimizer(Optimizer):
param_and_grad[0]) param_and_grad[0])
mean_square_acc = self._get_accumulator(self._mean_square_acc_str, mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
param_and_grad[0]) param_and_grad[0])
mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
param_and_grad[0])
rmsprop_op = block.append_op( rmsprop_op = block.append_op(
type=self.type, type=self.type,
inputs={ inputs={
...@@ -974,17 +997,20 @@ class RMSPropOptimizer(Optimizer): ...@@ -974,17 +997,20 @@ class RMSPropOptimizer(Optimizer):
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"Moment": momentum_acc, "Moment": momentum_acc,
"MeanSquare": mean_square_acc, "MeanSquare": mean_square_acc,
"MeanGrad": mean_grad_acc,
"LearningRate": self._create_param_lr(param_and_grad), "LearningRate": self._create_param_lr(param_and_grad),
}, },
outputs={ outputs={
"ParamOut": param_and_grad[0], "ParamOut": param_and_grad[0],
"MomentOut": momentum_acc, "MomentOut": momentum_acc,
"MeanSquareOut": mean_square_acc "MeanSquareOut": mean_square_acc,
"MeanGradOut": mean_grad_acc
}, },
attrs={ attrs={
"epsilon": self._epsilon, "epsilon": self._epsilon,
"decay": self._rho, "decay": self._rho,
"momentum": self._momentum "momentum": self._momentum,
"centered": self._centered
}) })
return rmsprop_op return rmsprop_op
......
...@@ -43,8 +43,9 @@ class ParallelExecutor(object): ...@@ -43,8 +43,9 @@ class ParallelExecutor(object):
num_trainers(int): If greater than 1, NCCL will be initialized with num_trainers(int): If greater than 1, NCCL will be initialized with
multiple ranks of nodes; each node should have the same number of GPUs. multiple ranks of nodes; each node should have the same number of GPUs.
Distributed training will then be enabled. Default 1. Distributed training will then be enabled. Default 1.
trainer_id(int: Must use together with num_trainers. trainer_id is the trainer_id(int): Must use together with num_trainers. trainer_id is the
"rank" of current node starts from 0. Default 0. "rank" of current node starts from 0. Default 0.
scope(Scope): scope to run with; defaults to fluid.global_scope().
Returns: Returns:
ParallelExecutor: The initialized ParallelExecutor object. ParallelExecutor: The initialized ParallelExecutor object.
...@@ -73,6 +74,7 @@ class ParallelExecutor(object): ...@@ -73,6 +74,7 @@ class ParallelExecutor(object):
build_strategy=None, build_strategy=None,
num_trainers=1, num_trainers=1,
trainer_id=0, trainer_id=0,
scope=None,
**kwargs): **kwargs):
if len(kwargs) != 0: if len(kwargs) != 0:
err_msg = "" err_msg = ""
...@@ -131,7 +133,8 @@ class ParallelExecutor(object): ...@@ -131,7 +133,8 @@ class ParallelExecutor(object):
main = main_program main = main_program
main = main if main else framework.default_main_program() main = main if main else framework.default_main_program()
scope = executor.global_scope() if scope is None:
scope = executor.global_scope()
# FIXME(Yancey1989): it's a temporary approach to determinate the distribute # FIXME(Yancey1989): it's a temporary approach to determinate the distribute
# train program, call self.bcast_param() at the end of each mini-batch. # train program, call self.bcast_param() at the end of each mini-batch.
self.is_dist = True if "recv" in [ self.is_dist = True if "recv" in [
......
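With the new scope argument, a ParallelExecutor's variables can live in a caller-supplied Scope instead of fluid.global_scope(). A hedged sketch (assumes avg_loss is the loss variable of an already-built main program):

import paddle.fluid as fluid

my_scope = fluid.core.Scope()
with fluid.scope_guard(my_scope):
    pe = fluid.ParallelExecutor(use_cuda=False,
                                loss_name=avg_loss.name,  # assumed to exist
                                scope=my_scope)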
...@@ -47,14 +47,14 @@ def train_program(): ...@@ -47,14 +47,14 @@ def train_program():
loss = fluid.layers.square_error_cost(input=y_predict, label=y) loss = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(loss) avg_loss = fluid.layers.mean(loss)
return avg_loss return [avg_loss, y_predict]
def optimizer_func(): def optimizer_func():
return fluid.optimizer.SGD(learning_rate=0.001) return fluid.optimizer.SGD(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname): def train(use_cuda, train_program, params_dirname, inference_model_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer( trainer = fluid.Trainer(
...@@ -74,6 +74,8 @@ def train(use_cuda, train_program, params_dirname): ...@@ -74,6 +74,8 @@ def train(use_cuda, train_program, params_dirname):
''' '''
if params_dirname is not None: if params_dirname is not None:
trainer.save_params(params_dirname) trainer.save_params(params_dirname)
trainer.save_inference_model(inference_model_dirname,
['x'], [1])
trainer.stop() trainer.stop()
trainer.train( trainer.train(
...@@ -99,15 +101,55 @@ def infer(use_cuda, inference_program, params_dirname=None): ...@@ -99,15 +101,55 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results[0]) print("infer results: ", results[0])
def infer_by_saved_model(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be fed
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
# The input should be 2-D, and the size of the second dim should be 13
# The input data should be >= 0
batch_size = 10
test_reader = paddle.batch(
paddle.dataset.uci_housing.test(), batch_size=batch_size)
test_data = next(test_reader())
test_feat = numpy.array(
[data[0] for data in test_data]).astype("float32")
test_label = numpy.array(
[data[1] for data in test_data]).astype("float32")
assert feed_target_names[0] == 'x'
results = exe.run(inference_program,
feed={feed_target_names[0]: numpy.array(test_feat)},
fetch_list=fetch_targets)
print("infer shape: ", results[0].shape)
print("infer results: ", results[0])
print("ground truth: ", test_label)
def main(use_cuda): def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda(): if use_cuda and not fluid.core.is_compiled_with_cuda():
return return
# Directory for saving the trained model # Directory for saving the trained model
params_dirname = "fit_a_line.inference.model" params_dirname = "fit_a_line.model"
inference_model_dirname = "fit_a_line.inference_model"
train(use_cuda, train_program, params_dirname) train(use_cuda, train_program, params_dirname, inference_model_dirname)
infer(use_cuda, inference_program, params_dirname) infer(use_cuda, inference_program, params_dirname)
infer_by_saved_model(use_cuda, inference_model_dirname)
class TestFitALine(unittest.TestCase): class TestFitALine(unittest.TestCase):
......
...@@ -18,6 +18,7 @@ import paddle ...@@ -18,6 +18,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import numpy import numpy
import six
import os import os
import cifar10_small_test_set import cifar10_small_test_set
...@@ -177,4 +178,7 @@ if __name__ == '__main__': ...@@ -177,4 +178,7 @@ if __name__ == '__main__':
for parallel in (False, True): for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
main(use_cuda=use_cuda, parallel=parallel) # TODO(minqiyang): remove this line after fixing the deletion
# order problem of Scope in ParallelExecutor in manylinux
if six.PY2:
main(use_cuda=use_cuda, parallel=parallel)
...@@ -18,6 +18,7 @@ import paddle ...@@ -18,6 +18,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import numpy import numpy
import six
import os import os
import cifar10_small_test_set import cifar10_small_test_set
...@@ -151,4 +152,7 @@ if __name__ == '__main__': ...@@ -151,4 +152,7 @@ if __name__ == '__main__':
for parallel in (False, True): for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
main(use_cuda=use_cuda, parallel=parallel) # TODO(minqiyang): remove this line after fixing the deletion
# order problem of Scope in ParallelExecutor in manylinux
if six.PY2:
main(use_cuda=use_cuda, parallel=parallel)
...@@ -18,6 +18,7 @@ import argparse ...@@ -18,6 +18,7 @@ import argparse
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle import paddle
import six
import sys import sys
import numpy import numpy
import unittest import unittest
...@@ -154,4 +155,7 @@ if __name__ == '__main__': ...@@ -154,4 +155,7 @@ if __name__ == '__main__':
for parallel in (False, True): for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
main(use_cuda=use_cuda, parallel=parallel) # TODO(minqiyang): remove this line after fixing the deletion
# order problem of Scope in ParallelExecutor in manylinux
if six.PY2:
main(use_cuda=use_cuda, parallel=parallel)
...@@ -18,6 +18,7 @@ import argparse ...@@ -18,6 +18,7 @@ import argparse
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle import paddle
import six
import sys import sys
import numpy import numpy
import unittest import unittest
...@@ -136,4 +137,7 @@ if __name__ == '__main__': ...@@ -136,4 +137,7 @@ if __name__ == '__main__':
for parallel in (False, True): for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
main(use_cuda=use_cuda, parallel=parallel) # TODO(minqiyang): remove this line after fixing the deletion
# order problem of Scope in ParallelExecutor in manylinux
if six.PY2:
main(use_cuda=use_cuda, parallel=parallel)
...@@ -36,6 +36,7 @@ import paddle.fluid as fluid ...@@ -36,6 +36,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from paddle.fluid import core from paddle.fluid import core
from test_dist_base import TestDistRunnerBase, runtime_main from test_dist_base import TestDistRunnerBase, runtime_main
import paddle.compat as cpt
from paddle.compat import long_type from paddle.compat import long_type
import hashlib import hashlib
...@@ -315,8 +316,9 @@ def pad_batch_data(insts, ...@@ -315,8 +316,9 @@ def pad_batch_data(insts,
""" """
return_list = [] return_list = []
max_len = max(len(inst) for inst in insts) max_len = max(len(inst) for inst in insts)
num_token = reduce(lambda x, y: x + y, num_token = six.moves.reduce(
[len(inst) for inst in insts]) if return_num_token else 0 lambda x, y: x + y,
[len(inst) for inst in insts]) if return_num_token else 0
# Any token included in dict can be used to pad, since the paddings' loss # Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients. # will be masked out by weights and make no effect on parameter gradients.
inst_data = np.array( inst_data = np.array(
...@@ -328,7 +330,7 @@ def pad_batch_data(insts, ...@@ -328,7 +330,7 @@ def pad_batch_data(insts,
return_list += [inst_weight.astype("float32").reshape([-1, 1])] return_list += [inst_weight.astype("float32").reshape([-1, 1])]
else: # position data else: # position data
inst_pos = np.array([ inst_pos = np.array([
range(1, len(inst) + 1) + [0] * (max_len - len(inst)) list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
for inst in insts for inst in insts
]) ])
return_list += [inst_pos.astype("int64").reshape([-1, 1])] return_list += [inst_pos.astype("int64").reshape([-1, 1])]
...@@ -385,10 +387,11 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx, ...@@ -385,10 +387,11 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx,
return_num_token=True) return_num_token=True)
data_input_dict = dict( data_input_dict = dict(
zip(data_input_names, [ list(
src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, zip(data_input_names, [
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
])) trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
])))
return data_input_dict, np.asarray([num_token], dtype="float32") return data_input_dict, np.asarray([num_token], dtype="float32")
...@@ -561,7 +564,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, ...@@ -561,7 +564,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
np.log(TrainTaskConfig.label_smooth_eps / ( np.log(TrainTaskConfig.label_smooth_eps / (
ModelHyperParams.trg_vocab_size - 1) + 1e-20)) ModelHyperParams.trg_vocab_size - 1) + 1e-20))
init = False init = False
for pass_id in xrange(TrainTaskConfig.pass_num): for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
pass_start_time = time.time() pass_start_time = time.time()
for batch_id, data in enumerate(train_data()): for batch_id, data in enumerate(train_data()):
if batch_id >= 5: if batch_id >= 5:
...@@ -587,11 +590,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, ...@@ -587,11 +590,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
ModelHyperParams.eos_idx, ModelHyperParams.n_head, ModelHyperParams.eos_idx, ModelHyperParams.n_head,
ModelHyperParams.d_model) ModelHyperParams.d_model)
total_num_token += num_token total_num_token += num_token
feed_kv_pairs = data_input_dict.items() feed_kv_pairs = list(data_input_dict.items())
if TrainTaskConfig.local: if TrainTaskConfig.local:
feed_kv_pairs += { feed_kv_pairs += list({
lr_scheduler.learning_rate.name: lr_rate lr_scheduler.learning_rate.name: lr_rate
}.items() }.items())
feed_list.append(dict(feed_kv_pairs)) feed_list.append(dict(feed_kv_pairs))
if not init: if not init:
...@@ -873,6 +876,7 @@ class DataReader(object): ...@@ -873,6 +876,7 @@ class DataReader(object):
f = tarfile.open(fpaths[0], "r") f = tarfile.open(fpaths[0], "r")
for line in f.extractfile(tar_fname): for line in f.extractfile(tar_fname):
line = cpt.to_text(line)
fields = line.strip("\n").split(self._field_delimiter) fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or ( if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1): self._only_src and len(fields) == 1):
...@@ -882,8 +886,9 @@ class DataReader(object): ...@@ -882,8 +886,9 @@ class DataReader(object):
if not os.path.isfile(fpath): if not os.path.isfile(fpath):
raise IOError("Invalid file: %s" % fpath) raise IOError("Invalid file: %s" % fpath)
with open(fpath, "r") as f: with open(fpath, "rb") as f:
for line in f: for line in f:
line = cpt.to_text(line)
fields = line.strip("\n").split(self._field_delimiter) fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or ( if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1): self._only_src and len(fields) == 1):
...@@ -892,8 +897,9 @@ class DataReader(object): ...@@ -892,8 +897,9 @@ class DataReader(object):
@staticmethod @staticmethod
def load_dict(dict_path, reverse=False): def load_dict(dict_path, reverse=False):
word_dict = {} word_dict = {}
with open(dict_path, "r") as fdict: with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict): for idx, line in enumerate(fdict):
line = cpt.to_text(line)
if reverse: if reverse:
word_dict[idx] = line.strip("\n") word_dict[idx] = line.strip("\n")
else: else:
...@@ -1034,7 +1040,7 @@ def multi_head_attention(queries, ...@@ -1034,7 +1040,7 @@ def multi_head_attention(queries,
# size of the input as the output dimension size. # size of the input as the output dimension size.
return layers.reshape( return layers.reshape(
x=trans_x, x=trans_x,
shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])) shape=list(map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])))
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
""" """
......
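The transformer-test edits all follow one Python 2/3 compatibility pattern: builtins that returned lists in Python 2 return iterators or views in Python 3, so they are wrapped in list() or replaced with six.moves equivalents. A condensed illustration:

import six

d = {'a': 1, 'b': 2}
pairs = list(d.items())              # .items() is a view in Python 3
pairs += list({'c': 3}.items())      # so += needs an explicit list()
pos = list(range(1, 4)) + [0] * 2    # range() no longer returns a list
total = six.moves.reduce(lambda x, y: x + y, [1, 2, 3])  # reduce moved
for i in six.moves.xrange(3):        # xrange is gone in Python 3
    pass
shape = list(map(int, [0, 0, 6]))    # map() returns an iterator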
...@@ -249,7 +249,7 @@ class OpTest(unittest.TestCase): ...@@ -249,7 +249,7 @@ class OpTest(unittest.TestCase):
outs, _ = self._calc_output(place) outs, _ = self._calc_output(place)
return outs return outs
def _calc_output(self, place, parallel=False): def _calc_output(self, place, parallel=False, no_check_set=None):
program = Program() program = Program()
block = program.global_block() block = program.global_block()
...@@ -273,6 +273,8 @@ class OpTest(unittest.TestCase): ...@@ -273,6 +273,8 @@ class OpTest(unittest.TestCase):
# if not, fill the fetch_list with the user-configured outputs in the test. # if not, fill the fetch_list with the user-configured outputs in the test.
if len(fetch_list) == 0: if len(fetch_list) == 0:
for var_name, var in six.iteritems(outputs): for var_name, var in six.iteritems(outputs):
if no_check_set is not None and var_name in no_check_set:
continue
if isinstance(var, list): if isinstance(var, list):
for v in var: for v in var:
fetch_list.append(v) fetch_list.append(v)
...@@ -291,11 +293,17 @@ class OpTest(unittest.TestCase): ...@@ -291,11 +293,17 @@ class OpTest(unittest.TestCase):
return_numpy=False) return_numpy=False)
return outs, fetch_list return outs, fetch_list
def check_output_with_place(self, place, atol): def check_output_with_place(self,
outs, fetch_list = self._calc_output(place) place,
atol,
no_check_set=None,
equal_nan=False):
outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
for out_name, out_dup in Operator.get_op_outputs(self.op_type): for out_name, out_dup in Operator.get_op_outputs(self.op_type):
if out_name not in self.outputs: if out_name not in self.outputs:
continue continue
if no_check_set is not None and out_name in no_check_set:
continue
def find_actual(target_name, fetch_list): def find_actual(target_name, fetch_list):
found = [ found = [
...@@ -321,7 +329,7 @@ class OpTest(unittest.TestCase): ...@@ -321,7 +329,7 @@ class OpTest(unittest.TestCase):
if isinstance(expect, tuple) else expect if isinstance(expect, tuple) else expect
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
actual_t, expect_t, atol=atol), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " + "Output (" + sub_out_name + ") has diff at " +
str(place)) str(place))
if isinstance(expect, tuple): if isinstance(expect, tuple):
...@@ -337,7 +345,7 @@ class OpTest(unittest.TestCase): ...@@ -337,7 +345,7 @@ class OpTest(unittest.TestCase):
expect_t = expect[0] if isinstance(expect, tuple) else expect expect_t = expect[0] if isinstance(expect, tuple) else expect
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
actual_t, expect_t, atol=atol), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t)) str(actual_t))
...@@ -360,10 +368,10 @@ class OpTest(unittest.TestCase): ...@@ -360,10 +368,10 @@ class OpTest(unittest.TestCase):
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
return places return places
def check_output(self, atol=1e-5): def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False):
places = self._get_places() places = self._get_places()
for place in places: for place in places:
self.check_output_with_place(place, atol) self.check_output_with_place(place, atol, no_check_set, equal_nan)
def check_output_customized(self, checker): def check_output_customized(self, checker):
places = self._get_places() places = self._get_places()
......
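The new no_check_set argument lets a test skip outputs whose values are placeholders (such as the XShape outputs introduced above), and equal_nan is forwarded to np.allclose. A condensed illustration of the pattern the *2 op tests below follow (class name hypothetical):

import numpy as np
from op_test import OpTest

class TestReshape2Sketch(OpTest):  # illustrative, mirrors the tests below
    def setUp(self):
        self.op_type = "reshape2"
        self.inputs = {"X": np.random.random((2, 25)).astype("float32")}
        self.attrs = {"shape": [5, 10]}
        self.outputs = {
            "Out": self.inputs["X"].reshape((5, 10)),
            # XShape is only consumed by the grad op, so its value is a dummy.
            "XShape": np.random.random((2, 25)).astype("float32"),
        }

    def test_check_output(self):
        self.check_output(no_check_set=["XShape"])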
...@@ -26,18 +26,15 @@ class TestAucOp(OpTest): ...@@ -26,18 +26,15 @@ class TestAucOp(OpTest):
pred = np.random.random((128, 2)).astype("float32") pred = np.random.random((128, 2)).astype("float32")
labels = np.random.randint(0, 2, (128, 1)) labels = np.random.randint(0, 2, (128, 1))
num_thresholds = 200 num_thresholds = 200
tp = np.zeros((num_thresholds, )).astype("int64")
tn = np.zeros((num_thresholds, )).astype("int64") stat_pos = np.zeros((num_thresholds + 1, )).astype("int64")
fp = np.zeros((num_thresholds, )).astype("int64") stat_neg = np.zeros((num_thresholds + 1, )).astype("int64")
fn = np.zeros((num_thresholds, )).astype("int64")
self.inputs = { self.inputs = {
'Predict': pred, 'Predict': pred,
'Label': labels, 'Label': labels,
'TP': tp, "StatPos": stat_pos,
'TN': tn, "StatNeg": stat_neg
'FP': fp,
'FN': fn
} }
self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
...@@ -47,11 +44,10 @@ class TestAucOp(OpTest): ...@@ -47,11 +44,10 @@ class TestAucOp(OpTest):
python_auc.update(pred, labels) python_auc.update(pred, labels)
self.outputs = { self.outputs = {
'AUC': python_auc.eval(), 'AUC': np.array(python_auc.eval()),
'TPOut': python_auc.tp_list, 'BatchAUC': np.array(python_auc.eval()),
'FNOut': python_auc.fn_list, 'StatPosOut': np.array(python_auc._stat_pos),
'TNOut': python_auc.tn_list, 'StatNegOut': np.array(python_auc._stat_neg)
'FPOut': python_auc.fp_list
} }
def test_check_output(self): def test_check_output(self):
......
...@@ -55,6 +55,7 @@ class TestDistRunnerBase(object): ...@@ -55,6 +55,7 @@ class TestDistRunnerBase(object):
pserver_prog = t.get_pserver_program(args.current_endpoint) pserver_prog = t.get_pserver_program(args.current_endpoint)
startup_prog = t.get_startup_program(args.current_endpoint, startup_prog = t.get_startup_program(args.current_endpoint,
pserver_prog) pserver_prog)
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(startup_prog) exe.run(startup_prog)
...@@ -147,6 +148,8 @@ def runtime_main(test_class): ...@@ -147,6 +148,8 @@ def runtime_main(test_class):
import paddle.compat as cpt import paddle.compat as cpt
import socket
from contextlib import closing
class TestDistBase(unittest.TestCase): class TestDistBase(unittest.TestCase):
...@@ -156,13 +159,19 @@ class TestDistBase(unittest.TestCase): ...@@ -156,13 +159,19 @@ class TestDistBase(unittest.TestCase):
def setUp(self): def setUp(self):
self._trainers = 2 self._trainers = 2
self._pservers = 2 self._pservers = 2
self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._python_interp = "python" self._python_interp = "python"
self._sync_mode = True self._sync_mode = True
self._mem_opt = False self._mem_opt = False
self._use_reduce = False self._use_reduce = False
self._setup_config() self._setup_config()
def _find_free_port(self):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
return s.getsockname()[1]
def start_pserver(self, model_file, check_error_log): def start_pserver(self, model_file, check_error_log):
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
......
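_find_free_port replaces the hard-coded pserver ports, avoiding collisions when several CI jobs share a machine: binding to port 0 asks the OS for any unused port. The trick in isolation:

import socket
from contextlib import closing

def find_free_port():
    # Binding to port 0 makes the OS pick a currently unused port.
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        return s.getsockname()[1]

print(find_free_port())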
...@@ -22,14 +22,17 @@ from op_test import OpTest ...@@ -22,14 +22,17 @@ from op_test import OpTest
class TestFlattenOp(OpTest): class TestFlattenOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "flatten" self.op_type = "flatten2"
self.init_test_case() self.init_test_case()
self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
self.init_attrs() self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.in_shape).astype("float32")
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=["XShape"])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
......
...@@ -53,11 +53,11 @@ class TestFusionLSTMOp(OpTest): ...@@ -53,11 +53,11 @@ class TestFusionLSTMOp(OpTest):
self.M = 8 self.M = 8
self.D = 16 self.D = 16
self.has_initial_state = False self.has_initial_state = False
self.use_peepholes = False
self.is_reverse = False self.is_reverse = False
self.act_gate = 'sigmoid' self.act_gate = 'sigmoid'
self.act_cell = 'tanh' self.act_cell = 'tanh'
self.act_cand = 'tanh' self.act_cand = 'tanh'
self.use_peepholes = False
self.set_conf() self.set_conf()
T = sum(self.lod[0]) T = sum(self.lod[0])
...@@ -159,5 +159,36 @@ class TestFusionLSTMOpBS1(TestFusionLSTMOp): ...@@ -159,5 +159,36 @@ class TestFusionLSTMOpBS1(TestFusionLSTMOp):
self.D = 16 self.D = 16
class TestFusionLSTMOpPeepholes(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
class TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
self.has_initial_state = True
class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
self.is_reverse = True
class TestFusionLSTMOpPeepholesInitReverse(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
self.has_initial_state = True
self.is_reverse = True
class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
self.lod = [[2]]
self.D = 8
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -85,6 +85,7 @@ class TestFetchOp(unittest.TestCase): ...@@ -85,6 +85,7 @@ class TestFetchOp(unittest.TestCase):
assert not math.isnan(np.sum(ret[i])) and \ assert not math.isnan(np.sum(ret[i])) and \
not math.isinf(np.sum(ret[i])) not math.isinf(np.sum(ret[i]))
@unittest.skip(reason="CI timeout")
def test_fetch_op(self): def test_fetch_op(self):
tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16) tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
tst_reader_iter = tst_reader() tst_reader_iter = tst_reader()
...@@ -139,6 +140,7 @@ class TestFeedParallel(unittest.TestCase): ...@@ -139,6 +140,7 @@ class TestFeedParallel(unittest.TestCase):
if batch_id == 2: if batch_id == 2:
break break
@unittest.skip(reason="CI timeout")
def test_feed_op(self): def test_feed_op(self):
os.environ['CPU_NUM'] = str(4) os.environ['CPU_NUM'] = str(4)
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
......
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import six
from op_test import OpTest from op_test import OpTest
...@@ -62,17 +63,20 @@ class PReluTest(OpTest): ...@@ -62,17 +63,20 @@ class PReluTest(OpTest):
# TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues # TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues
# class TestCase1(PReluTest): if six.PY2:
# def initTestCase(self):
# self.attrs = {'mode': "all"}
# class TestCase2(PReluTest): class TestCase1(PReluTest):
# def initTestCase(self): def initTestCase(self):
# self.attrs = {'mode': "channel"} self.attrs = {'mode': "all"}
class TestCase2(PReluTest):
def initTestCase(self):
self.attrs = {'mode': "channel"}
class TestCase3(PReluTest):
def initTestCase(self):
self.attrs = {'mode': "element"}
# class TestCase3(PReluTest):
# def initTestCase(self):
# self.attrs = {'mode': "element"}
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -22,106 +22,39 @@ from op_test import OpTest ...@@ -22,106 +22,39 @@ from op_test import OpTest
class TestReshapeOp(OpTest): class TestReshapeOp(OpTest):
def setUp(self): def setUp(self):
ori_shape = (2, 25) self.init_data()
new_shape = (5, 10) self.op_type = "reshape2"
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.op_type = "reshape" self.attrs = {"shape": self.new_shape}
self.inputs = {"X": np.random.random(ori_shape).astype("float32")} self.outputs = {
self.attrs = {"shape": new_shape} "Out": self.inputs["X"].reshape(self.infered_shape),
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} 'XShape': np.random.random(self.ori_shape).astype("float32")
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInfer1(OpTest):
def setUp(self):
ori_shape = (5, 10)
new_shape = (5, -1, 5)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInfer2(OpTest):
def setUp(self):
ori_shape = (2, 2, 6)
new_shape = (2, 0, 3, -1)
infered_shape = (2, 2, 3, -1)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpInplace(OpTest):
def setUp(self):
ori_shape = (2, 25)
new_shape = (5, 10)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInferInplace1(OpTest):
def setUp(self):
ori_shape = (5, 10)
new_shape = (5, -1, 5)
self.op_type = "reshape" def init_data(self):
self.inputs = {"X": np.random.random(ori_shape).astype("float32")} self.ori_shape = (2, 25)
self.attrs = {"shape": new_shape} self.new_shape = (5, 10)
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} self.infered_shape = (5, 10)
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=['XShape'])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
class TestReshapeOpDimInferInplace2(OpTest): class TestReshapeOpDimInfer1(TestReshapeOp):
def setUp(self): def init_data(self):
ori_shape = (2, 2, 6) self.ori_shape = (5, 10)
new_shape = (2, 0, 3, -1) self.new_shape = (5, -1, 5)
infered_shape = (2, 2, 3, -1) self.infered_shape = (5, -1, 5)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self): class TestReshapeOpDimInfer2(TestReshapeOp):
self.check_grad(["X"], "Out") def init_data(self):
self.ori_shape = (2, 2, 6)
self.new_shape = (2, 0, 3, -1)
self.infered_shape = (2, 2, 3, -1)
class TestReshapeOpWithInputShape(OpTest): class TestReshapeOpWithInputShape(OpTest):
...@@ -130,20 +63,23 @@ class TestReshapeOpWithInputShape(OpTest): ...@@ -130,20 +63,23 @@ class TestReshapeOpWithInputShape(OpTest):
new_shape = (0, -1, 5) new_shape = (0, -1, 5)
actual_shape = (2, 3, 5) actual_shape = (2, 3, 5)
self.op_type = "reshape" self.op_type = "reshape2"
self.inputs = { self.inputs = {
"X": np.random.random(ori_shape).astype("float32"), "X": np.random.random(ori_shape).astype("float32"),
"Shape": np.array( "Shape": np.array(
actual_shape, dtype="int32") actual_shape, dtype="int32")
} }
self.attrs = {"shape": new_shape} self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)} self.outputs = {
"Out": self.inputs["X"].reshape(actual_shape),
'XShape': np.random.random(ori_shape).astype("float32")
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=['XShape'])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out", sum_outputs=["Out"])
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -15,90 +15,164 @@ ...@@ -15,90 +15,164 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest import paddle.fluid.core as core
from paddle.fluid.op import Operator
class TestRmspropOp1(OpTest):
''' Test RMSProp with explicit inputs class TestBase(unittest.TestCase):
''' def setup(self, centered, epsilon=1e-6):
np.random.seed(5) # fix seed
def setUp(self):
self.op_type = "rmsprop" self.param_name = "param"
self.param = np.random.random((123, 321)).astype("float32")
param = np.random.random((123, 321)).astype("float32")
mean_square = np.random.random((123, 321)).astype("float32") self.mean_square_name = "mean_square"
learning_rate = np.array([0.01]).astype("float32") self.mean_square = np.random.random((123, 321)).astype("float32")
grad = np.random.random((123, 321)).astype("float32")
moment = np.zeros((123, 321)).astype("float32") self.mean_grad_name = "mean_grad"
self.mean_grad = np.random.random((123, 321)).astype("float32")
epsilon = 1e-6
decay = 0.9 self.lr_name = "lr"
momentum = 0.0 self.learning_rate = np.array([0.01]).astype("float32")
self.inputs = { self.grad_name = "grad"
'Param': param, self.grad = np.random.random((123, 321)).astype("float32")
'MeanSquare': mean_square,
'LearningRate': learning_rate, self.moment_name = "moment"
'Grad': grad, self.moment = np.zeros((123, 321)).astype("float32")
'Moment': moment,
} self.epsilon = epsilon
self.decay = 0.9
self.attrs = {'epsilon': epsilon, 'decay': decay, 'momentum': momentum} self.momentum = 0.0
self.centered = centered
ms_out = decay * mean_square + (1 - decay) * grad * grad
moment_out = momentum * moment + \ self.ms_out = self.decay * self.mean_square + (1 - self.decay
learning_rate * grad / np.sqrt(ms_out + epsilon) ) * self.grad * self.grad
param_out = param - moment_out if centered:
self.mg_out = self.decay * self.mean_grad + (1 - self.decay
self.outputs = { ) * self.grad
'ParamOut': param_out, self.moment_out = self.momentum * self.moment + \
'MomentOut': moment_out, self.learning_rate * self.grad / np.sqrt(self.ms_out - np.square(self.mg_out) + self.epsilon)
'MeanSquareOut': ms_out else:
} self.moment_out = self.momentum * self.moment + \
self.learning_rate * self.grad / np.sqrt(self.ms_out + self.epsilon)
def test_check_output(self):
self.check_output() self.param_out = self.param - self.moment_out
def check(self,
class TestRmspropOp2(OpTest): actual_t,
'''Test RMSProp with default values for attributes expect_t,
''' place,
out_name,
def setUp(self): atol=1e-5,
self.op_type = "rmsprop" equal_nan=False):
self.assertTrue(
param = np.random.random((123, 321)).astype("float32") np.allclose(
mean_square = np.random.random((123, 321)).astype("float32") actual_t, expect_t, atol=atol, equal_nan=equal_nan),
learning_rate = np.array([0.01]).astype("float32") "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
grad = np.random.random((123, 321)).astype("float32") + str(expect_t) + "\n" + "But Got" + str(actual_t))
moment = np.zeros((123, 321)).astype("float32")
epsilon = 1.0e-10 class TestRmspropOp(TestBase):
decay = 0.9 def check_with_place(self, place, centered, epsilon):
momentum = 0.0 self.setup(centered, epsilon)
scope = core.Scope()
self.inputs = {
'Param': param, # create and initialize Param Variable
'MeanSquare': mean_square, param = scope.var(self.param_name).get_tensor()
'LearningRate': learning_rate, param.set(self.param, place)
'Grad': grad,
'Moment': moment, mean_square = scope.var(self.mean_square_name).get_tensor()
} mean_square.set(self.mean_square, place)
ms_out = decay * mean_square + (1 - decay) * grad * grad lr = scope.var(self.lr_name).get_tensor()
moment_out = momentum * moment + \ lr.set(self.learning_rate, place)
learning_rate * grad / np.sqrt(ms_out + epsilon)
param_out = param - moment_out grad = scope.var(self.grad_name).get_tensor()
grad.set(self.grad, place)
self.outputs = {
'ParamOut': param_out, moment = scope.var(self.moment_name).get_tensor()
'MomentOut': moment_out, moment.set(self.moment, place)
'MeanSquareOut': ms_out
} # create and run sgd operator
def test_check_output(self): if self.centered:
self.check_output() mean_grad = scope.var(self.mean_grad_name).get_tensor()
mean_grad.set(self.mean_grad, place)
rmsprop_op = Operator(
"rmsprop",
Param=self.param_name,
Grad=self.grad_name,
MeanSquare=self.mean_square_name,
MeanGrad=self.mean_grad_name,
Moment=self.moment_name,
LearningRate=self.lr_name,
ParamOut=self.param_name,
MeanSquareOut=self.mean_square_name,
MomentOut=self.moment_name,
MeanGradOut=self.mean_grad_name,
epsilon=self.epsilon,
decay=self.decay,
momentum=self.momentum,
centered=True)
else:
rmsprop_op = Operator(
"rmsprop",
Param=self.param_name,
Grad=self.grad_name,
MeanSquare=self.mean_square_name,
Moment=self.moment_name,
LearningRate=self.lr_name,
ParamOut=self.param_name,
MeanSquareOut=self.mean_square_name,
MomentOut=self.moment_name,
epsilon=self.epsilon,
decay=self.decay,
momentum=self.momentum,
centered=False)
rmsprop_op.run(scope, place)
atol = 1e-5
equal_nan = False
if self.centered:
atol = 1e-3
equal_nan = True
self.check(
np.array(mean_square), self.ms_out, place, self.mean_square_name)
self.check(
np.array(moment),
self.moment_out,
place,
self.moment_name,
atol=atol,
equal_nan=equal_nan)
self.check(
np.array(param),
self.param_out,
place,
self.param_name,
atol=atol,
equal_nan=equal_nan)
if self.centered:
self.check(
np.array(mean_grad), self.mg_out, place, self.mean_grad_name)
def test_rmsprop(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place, False, 1e-6)
self.check_with_place(place, False, 1e-10)
self.check_with_place(place, True, 1e-6)
self.check_with_place(place, True, 1e-10)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -23,14 +23,17 @@ from op_test import OpTest ...@@ -23,14 +23,17 @@ from op_test import OpTest
# Correct: General. # Correct: General.
class TestSqueezeOp(OpTest): class TestSqueezeOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "squeeze" self.op_type = "squeeze2"
self.init_test_case() self.init_test_case()
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.init_attrs() self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.ori_shape).astype("float32")
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=['XShape'])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
......
...@@ -22,16 +22,19 @@ from op_test import OpTest ...@@ -22,16 +22,19 @@ from op_test import OpTest
class TestTransposeOp(OpTest): class TestTransposeOp(OpTest):
def setUp(self): def setUp(self):
self.initTestCase() self.initTestCase()
self.op_type = "transpose" self.op_type = "transpose2"
self.inputs = {'X': np.random.random(self.shape).astype("float32")} self.inputs = {'X': np.random.random(self.shape).astype("float32")}
self.attrs = {'axis': list(self.axis)} self.attrs = {'axis': list(self.axis)}
self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} self.outputs = {
'XShape': np.random.random(self.shape).astype("float32"),
'Out': self.inputs['X'].transpose(self.axis)
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=['XShape'])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['X'], 'Out') self.check_grad(['X'], 'Out', sum_outputs=['Out'])
def initTestCase(self): def initTestCase(self):
self.shape = (3, 4) self.shape = (3, 4)
......
...@@ -24,13 +24,16 @@ from op_test import OpTest ...@@ -24,13 +24,16 @@ from op_test import OpTest
class TestUnsqueezeOp(OpTest): class TestUnsqueezeOp(OpTest):
def setUp(self): def setUp(self):
self.init_test_case() self.init_test_case()
self.op_type = "unsqueeze" self.op_type = "unsqueeze2"
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.init_attrs() self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.ori_shape).astype("float32")
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=["XShape"])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
......
...@@ -431,6 +431,28 @@ class Trainer(object): ...@@ -431,6 +431,28 @@ class Trainer(object):
exe = executor.Executor(self.place) exe = executor.Executor(self.place)
io.save_persistables(exe, dirname=param_path) io.save_persistables(exe, dirname=param_path)
def save_inference_model(self, param_path, feeded_var_names,
target_var_indexes):
"""
Save model for cpp inference into :code:`param_path`.
Args:
param_path(str): The path to save the inference model.
feeded_var_names(list(str)): The names of the vars that you
need to feed before running the program.
target_var_indexes(list(int)): the indexes of the target vars that
you need returned from trainer.train_func.
Returns:
None
"""
with self._prog_and_scope_guard():
exe = executor.Executor(self.place)
target_vars = [
self.train_func_outputs[index] for index in target_var_indexes
]
io.save_inference_model(param_path, feeded_var_names, target_vars,
exe)
@contextlib.contextmanager @contextlib.contextmanager
def _prog_and_scope_guard(self): def _prog_and_scope_guard(self):
with framework.program_guard( with framework.program_guard(
......
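A usage sketch of the new Trainer.save_inference_model, matching the fit_a_line test earlier in this change: train_program returns [avg_loss, y_predict], so index 1 selects the prediction as the fetch target (paths illustrative):

# Assumed: `trainer` was built from a train_func returning [avg_loss, y_predict].
trainer.save_inference_model('fit_a_line.inference_model',
                             feeded_var_names=['x'],
                             target_var_indexes=[1])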
...@@ -153,7 +153,7 @@ def block_to_code(block, block_idx): ...@@ -153,7 +153,7 @@ def block_to_code(block, block_idx):
indent += 1 indent += 1
# sort all vars # sort all vars
all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0]) all_vars = sorted(six.iteritems(block.vars), key=lambda x: x[0])
for var in all_vars: for var in all_vars:
print("{}{}".format(get_indent_space(indent), variable_to_code(var[1]))) print("{}{}".format(get_indent_space(indent), variable_to_code(var[1])))
......
...@@ -300,7 +300,7 @@ class DistributeTranspiler(object): ...@@ -300,7 +300,7 @@ class DistributeTranspiler(object):
input_deps = grad_name_to_send_dummy_out.values() input_deps = grad_name_to_send_dummy_out.values()
program.global_block().append_op( program.global_block().append_op(
type="send_barrier", type="send_barrier",
inputs={"X": input_deps}, inputs={"X": list(input_deps)},
outputs={"Out": send_barrier_out}, outputs={"Out": send_barrier_out},
attrs={ attrs={
"endpoints": pserver_endpoints, "endpoints": pserver_endpoints,
...@@ -401,7 +401,7 @@ class DistributeTranspiler(object): ...@@ -401,7 +401,7 @@ class DistributeTranspiler(object):
Args: Args:
recv_vars (list): Variable list to recv for current trainer_id recv_vars (list): Variable list to recv for current trainer_id
eplist (list): A list of strings indicating eplist (list): A list of strings indicating
Returns: Returns:
Program: trainer side startup program. Program: trainer side startup program.
...@@ -455,7 +455,7 @@ class DistributeTranspiler(object): ...@@ -455,7 +455,7 @@ class DistributeTranspiler(object):
if len(splited_var) <= 1: if len(splited_var) <= 1:
continue continue
# NOTE: if enable memory optimization, origin vars maybe removed. # NOTE: if enable memory optimization, origin vars maybe removed.
if startup_program.global_block().vars.has_key(varname): if varname in startup_program.global_block().vars:
orig_param = startup_program.global_block().vars[varname] orig_param = startup_program.global_block().vars[varname]
else: else:
origin_param_var = self.origin_program.global_block().vars[ origin_param_var = self.origin_program.global_block().vars[
...@@ -690,7 +690,7 @@ class DistributeTranspiler(object): ...@@ -690,7 +690,7 @@ class DistributeTranspiler(object):
Args: Args:
endpoint (str): current pserver endpoint. endpoint (str): current pserver endpoint.
Returns: Returns:
tuple: (main_program, startup_program), of type "Program" tuple: (main_program, startup_program), of type "Program"
""" """
...@@ -713,7 +713,7 @@ class DistributeTranspiler(object): ...@@ -713,7 +713,7 @@ class DistributeTranspiler(object):
endpoint (str): current pserver endpoint. endpoint (str): current pserver endpoint.
pserver_program (Program): deprecated, call get_pserver_program first. pserver_program (Program): deprecated, call get_pserver_program first.
startup_program (Program): deprecated, should pass startup_program startup_program (Program): deprecated, should pass startup_program
when initializing when initializing
Returns: Returns:
Program: parameter server side startup program. Program: parameter server side startup program.
......