Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_CudnnHolder_bug

8331e835 · Yang Yu · 56750e6a · 14242eae · 8331e835 · 8331e835
108 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -213,9 +213,11 @@ include(configure)          # add paddle env configuration
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
+endif()
+if(WITH_MKL OR WITH_MKLML)
    include(external/anakin)
 elseif()
-    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
+    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()

 include(generic)            # simplify cmake module

--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
@@ -11,6 +11,7 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
 # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
 # exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...

+
 RUN pip install -U pip
 RUN pip install -U kubernetes paddlepaddle

@@ -27,5 +28,6 @@ ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl 

 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
+ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
 ADD models/ /workspace/models/
+
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@@ -17,7 +17,8 @@ import argparse
 __all__ = ['parse_args', ]

 BENCHMARK_MODELS = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "se_resnext", "vgg", "mnist",
+    "stacked_dynamic_lstm", "resnet_with_preprocess"
 ]


@@ -67,12 +68,12 @@ def parse_args():
        '--cpus',
        type=int,
        default=1,
-        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+        help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
    parser.add_argument(
        '--data_set',
        type=str,
        default='flowers',
-        choices=['cifar10', 'flowers'],
+        choices=['cifar10', 'flowers', 'imagenet'],
        help='Optional dataset for benchmark.')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
@@ -122,6 +123,11 @@ def parse_args():
        type=str,
        default="",
        help='Directory that contains all the training recordio files.')
+    parser.add_argument(
+        '--test_data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the test data (NOT recordio).')
    parser.add_argument(
        '--use_inference_transpiler',
        action='store_true',
@@ -130,5 +136,9 @@ def parse_args():
        '--no_random',
        action='store_true',
        help='If set, keep the random seed and do not shuffle the data.')
+    parser.add_argument(
+        '--use_lars',
+        action='store_true',
+        help='If set, use lars for optimizers, ONLY support resnet module.')
    args = parser.parse_args()
    return args
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -16,6 +16,7 @@ import argparse
 import cProfile
 import time
 import os
+import traceback

 import numpy as np

@@ -27,7 +28,7 @@ import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
 from args import *


-def append_nccl2_prepare(trainer_id):
+def append_nccl2_prepare(trainer_id, startup_prog):
    if trainer_id >= 0:
        # append gen_nccl_id at the end of startup program
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
@@ -40,11 +41,11 @@ def append_nccl2_prepare(trainer_id):
        current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
        worker_endpoints.remove(current_endpoint)

-        nccl_id_var = fluid.default_startup_program().global_block().create_var(
+        nccl_id_var = startup_prog.global_block().create_var(
            name="NCCLID",
            persistable=True,
            type=fluid.core.VarDesc.VarType.RAW)
-        fluid.default_startup_program().global_block().append_op(
+        startup_prog.global_block().append_op(
            type="gen_nccl_id",
            inputs={},
            outputs={"NCCLID": nccl_id_var},
@@ -59,7 +60,7 @@ def append_nccl2_prepare(trainer_id):
                        "nccl-based dist train.")


-def dist_transpile(trainer_id, args):
+def dist_transpile(trainer_id, args, train_prog, startup_prog):
    if trainer_id < 0:
        return None, None

@@ -80,133 +81,69 @@ def dist_transpile(trainer_id, args):
    # the role, should be either PSERVER or TRAINER
    training_role = os.getenv("PADDLE_TRAINING_ROLE")

-    t = distribute_transpiler.DistributeTranspiler()
+    config = distribute_transpiler.DistributeTranspilerConfig()
+    config.slice_var_up = not args.no_split_var
+    t = distribute_transpiler.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
+        # NOTE: *MUST* use train_prog, for we are using with guard to
+        # generate different program for train and test.
+        program=train_prog,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=not args.async_mode)
    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(current_endpoint)
-        pserver_startup_program = t.get_startup_program(current_endpoint,
-                                                        pserver_program)
+        pserver_startup_program = t.get_startup_program(
+            current_endpoint, pserver_program, startup_program=startup_prog)
        return pserver_program, pserver_startup_program
    elif training_role == "TRAINER":
        train_program = t.get_trainer_program()
-        return train_program, fluid.default_startup_program()
+        return train_program, startup_prog
    else:
        raise ValueError(
            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
        )


-def test(exe, inference_program, test_reader, feeder, batch_acc):
-    accuracy_evaluator = fluid.metrics.Accuracy()
-    for batch_id, data in enumerate(test_reader()):
-        acc = exe.run(inference_program,
-                      feed=feeder.feed(data),
-                      fetch_list=[batch_acc])
-        accuracy_evaluator.update(value=np.array(acc), weight=len(data))
+def test_parallel(exe, test_args, args, test_prog, feeder):
+    acc_evaluators = []
+    for i in xrange(len(test_args[2])):
+        acc_evaluators.append(fluid.metrics.Accuracy())

-    return accuracy_evaluator.eval()
-
-
-# TODO(wuyi): replace train, train_parallel, test functions with new trainer
-# API once it is ready.
-def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
-          args, train_prog, startup_prog):
-    if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
-        place = core.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-        exe.run(train_prog)
-        return
-
-    if args.use_fake_data:
-        raise Exception(
-            "fake data is not supported in single GPU test for now.")
-
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(startup_prog)
-
-    # Use inference_transpiler to speedup
-    if not args.use_reader_op:
-        feed_var_list = [
-            var for var in train_prog.global_block().vars.itervalues()
-            if var.is_data
-        ]
-        feeder = fluid.DataFeeder(feed_var_list, place)
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        train_losses = []
-        if not args.use_reader_op:
-            reader_generator = train_reader()
-        batch_id = 0
-        data = None
+    to_fetch = [v.name for v in test_args[2]]
+    if args.use_reader_op:
+        test_args[4].start()
        while True:
-            if not args.use_reader_op:
-                data = next(reader_generator, None)
-                if data == None:
-                    break
-            if iters == args.iterations:
-                reader_generator.close()
+            try:
+                acc_rets = exe.run(fetch_list=to_fetch)
+                for i, e in enumerate(acc_evaluators):
+                    e.update(
+                        value=np.array(acc_rets[i]), weight=args.batch_size)
+            except fluid.core.EOFException as eof:
+                test_args[4].reset()
                break
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
+    else:
+        for batch_id, data in enumerate(test_args[3]()):
+            acc_rets = exe.run(feed=feeder.feed(data), fetch_list=to_fetch)
+            for i, e in enumerate(acc_evaluators):
+                e.update(value=np.array(acc_rets[i]), weight=len(data))

-            if args.use_reader_op:
-                try:
-                    loss = exe.run(train_prog, fetch_list=[avg_loss])
-                except fluid.core.EnforceNotMet as ex:
-                    break
-            else:
-                loss = exe.run(train_prog,
-                               feed=feeder.feed(data),
-                               fetch_list=[avg_loss])
-            iters += 1
-            batch_id += 1
-            # FIXME(wuyi): For use_reader_op, if the current
-            # pass is not the last, the last batch of this pass
-            # is also equal to args.batch_size.
-            if args.use_reader_op:
-                num_samples += args.batch_size * args.gpus
-            else:
-                num_samples += len(data)
-            train_losses.append(loss)
-            print("Pass: %d, Iter: %d, Loss: %f\n" %
-                  (pass_id, iters, np.mean(train_losses)))
-        print_train_time(start_time, time.time(), num_samples)
-        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
-        # evaluation
-        if not args.no_test and batch_acc and not args.use_reader_op:
-            if args.use_inference_transpiler:
-                t = fluid.InferenceTranspiler()
-                t.transpile(infer_prog, place)
-
-            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
-                                 batch_acc)
-            print(", Test Accuracy: %f" % pass_test_acc)
-        print("\n")
-        # TODO(wuyi): add warmup passes to get better perf data.
-        exit(0)
+    return [e.eval() for e in acc_evaluators]


-# TODO(wuyi): replace train, train_parallel, test functions with new trainer
-# API once it is ready.
-def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
-                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
-                   num_trainers, trainer_id):
+# NOTE: we only need to benchmark with ParallelExecutor.
+def train_parallel(train_args, test_args, args, train_prog, test_prog,
+                   startup_prog, nccl_id_var, num_trainers, trainer_id):
+    over_all_start = time.time()
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    feeder = None
    if not args.use_reader_op:
        feed_var_list = [
            var for var in train_prog.global_block().vars.itervalues()
            if var.is_data
        ]
        feeder = fluid.DataFeeder(feed_var_list, place)
-
    # generate fake:
    if args.use_fake_data:
        for var in feed_var_list:
@@ -230,63 +167,110 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
    startup_exe = fluid.Executor(place)
    startup_exe.run(startup_prog)
    strategy = fluid.ExecutionStrategy()
-    strategy.num_threads = 1
+    strategy.num_threads = args.cpus
    strategy.allow_op_delay = False
+    avg_loss = train_args[0]
+
+    if args.update_method == "pserver":
+        # parameter server mode distributed training, merge
+        # gradients on local server, do not initialize
+        # ParallelExecutor with multi server all-reduce mode.
+        num_trainers = 1
+        trainer_id = 0
+
    exe = fluid.ParallelExecutor(
        True,
        avg_loss.name,
+        main_program=train_prog,
        exec_strategy=strategy,
        num_trainers=num_trainers,
        trainer_id=trainer_id)

+    if not args.no_test:
+        if args.update_method == "pserver":
+            test_scope = None
+        else:
+            # NOTE: use an empty scope to avoid test exe using NCCLID
+            test_scope = fluid.Scope()
+        test_exe = fluid.ParallelExecutor(
+            True, main_program=test_prog, share_vars_from=exe)
+
    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        start_time = time.time()
        if not args.use_reader_op:
-            reader_generator = train_reader()
+            reader_generator = train_args[3]()  #train_reader
        batch_id = 0
        data = None
+        if args.use_reader_op:
+            train_args[4].start()
        while True:
            if not args.use_reader_op:
                data = next(reader_generator, None)
                if data == None:
                    break
+            if args.profile and batch_id == 5:
+                profiler.start_profiler("All")
+                profiler.reset_profiler()
+            elif args.profile and batch_id == 10:
+                print("profiling total time: ", time.time() - start_time)
+                profiler.stop_profiler("total", "/tmp/profile_%d_pass%d" %
+                                       (trainer_id, pass_id))
            if iters == args.iterations:
                reader_generator.close()
                break
-            if args.profile and pass_id == 0 and batch_id == 5:
-                profiler.start_profiler("All")
-            elif args.profile and pass_id == 0 and batch_id == 10:
-                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)

            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
+            fetch_list = [avg_loss.name]
+            acc_name_list = [v.name for v in train_args[2]]
+            fetch_list.extend(acc_name_list)
+
            if args.use_fake_data or args.use_reader_op:
                try:
-                    loss, = exe.run([avg_loss.name])
+
+                    fetch_ret = exe.run(fetch_list)
+                except fluid.core.EOFException as eof:
+                    break
                except fluid.core.EnforceNotMet as ex:
+                    traceback.print_exc()
                    break
            else:
-                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+                fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
            if args.use_reader_op:
                num_samples += args.batch_size * args.gpus
            else:
                num_samples += len(data)
+
            iters += 1
            if batch_id % 1 == 0:
-                print("Pass %d, batch %d, loss %s" %
-                      (pass_id, batch_id, np.array(loss)))
+                fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
+                print("Pass %d, batch %d, loss %s, accucacys: %s" %
+                      (pass_id, batch_id, fetched_data[0], fetched_data[1:]))
            batch_id += 1

        print_train_time(start_time, time.time(), num_samples)
-        if not args.no_test and batch_acc and not args.use_reader_op:
-            # we have not implement record io for test
-            # skip test when use args.use_reader_op
-            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
-                            batch_acc)
-            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
+        if args.use_reader_op:
+            train_args[4].reset()  # reset reader handle
+        else:
+            del reader_generator
+
+        if not args.no_test and test_args[2]:
+            test_feeder = None
+            if not args.use_reader_op:
+                test_feed_var_list = [
+                    var for var in test_prog.global_block().vars.itervalues()
+                    if var.is_data
+                ]
+                test_feeder = fluid.DataFeeder(test_feed_var_list, place)
+            test_ret = test_parallel(test_exe, test_args, args, test_prog,
+                                     test_feeder)
+            print("Pass: %d, Test Accuracy: %s\n" %
+                  (pass_id, [np.mean(np.array(v)) for v in test_ret]))
+
+    print("total train time: ", time.time() - over_all_start)


 def print_arguments(args):
@@ -328,44 +312,46 @@ def main():
    if args.use_cprof:
        pr = cProfile.Profile()
        pr.enable()
+
    model_def = __import__("models.%s" % args.model, fromlist=["models"])
-    train_args = list(model_def.get_model(args))
-    train_args.append(args)
-    # Run optimizer.minimize(avg_loss)
-    train_args[2].minimize(train_args[0])
-    if args.memory_optimize:
-        fluid.memory_optimize(fluid.default_main_program())
+
+    train_prog = fluid.Program()
+    test_prog = fluid.Program()
+    startup_prog = fluid.Program()
+
+    train_args = list(model_def.get_model(args, True, train_prog, startup_prog))
+    test_args = list(model_def.get_model(args, False, test_prog, startup_prog))
+
+    all_args = [train_args, test_args, args]

    if args.update_method == "pserver":
-        train_prog, startup_prog = dist_transpile(trainer_id, args)
+        train_prog, startup_prog = dist_transpile(trainer_id, args, train_prog,
+                                                  startup_prog)
        if not train_prog:
            raise Exception(
                "Must configure correct environments to run dist train.")
-        train_args.extend([train_prog, startup_prog])
+        all_args.extend([train_prog, test_prog, startup_prog])
        if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
-            train_args.extend([nccl_id_var, num_trainers, trainer_id])
-            train_parallel(*train_args)
-        train(*train_args)
+            all_args.extend([nccl_id_var, num_trainers, trainer_id])
+            train_parallel(*all_args)
+        elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
+            # start pserver with Executor
+            server_exe = fluid.Executor(fluid.CPUPlace())
+            server_exe.run(startup_prog)
+            server_exe.run(train_prog)
        exit(0)

    # for other update methods, use default programs
-    train_args.append(fluid.default_main_program())
-    train_args.append(fluid.default_startup_program())
+    all_args.extend([train_prog, test_prog, startup_prog])

    if args.update_method == "nccl2":
-        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
-    if args.gpus == 1:
-        # NOTE: parallel executor use profiler interanlly
-        if args.use_nvprof and args.device == 'GPU':
-            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
-                train(*train_args)
-        else:
-            train(*train_args)
-    else:
-        if args.device == "CPU":
-            raise Exception("Only support GPU perf with parallel exe")
-        train_args.extend([nccl_id_var, num_trainers, trainer_id])
-        train_parallel(*train_args)
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(
+            trainer_id, startup_prog)
+
+    if args.device == "CPU":
+        raise Exception("Only support GPU perf with parallel exe")
+    all_args.extend([nccl_id_var, num_trainers, trainer_id])
+    train_parallel(*all_args)


 if __name__ == "__main__":

--- a/benchmark/fluid/imagenet_reader.py
+++ b/benchmark/fluid/imagenet_reader.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import math
+import random
+import functools
+import numpy as np
+from threading import Thread
+import subprocess
+import time
+
+from Queue import Queue
+import paddle
+from PIL import Image, ImageEnhance
+
+random.seed(0)
+
+DATA_DIM = 224
+
+THREAD = int(os.getenv("PREPROCESS_THREADS", "10"))
+BUF_SIZE = 5120
+
+DATA_DIR = '/mnt/ImageNet'
+TRAIN_LIST = '/mnt/ImageNet/train.txt'
+TEST_LIST = '/mnt/ImageNet/val.txt'
+
+img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
+img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
+
+
+def resize_short(img, target_size):
+    """Scale ``img`` so that its shorter side becomes ``target_size``.
+
+    Both sides are multiplied by the same factor and rounded to the nearest
+    integer; resampling uses the LANCZOS filter.
+
+    Args:
+        img: image object exposing ``size`` as (width, height) and ``resize``.
+        target_size: desired length of the shorter side, in pixels.
+
+    Returns:
+        The resized image.
+    """
+    width, height = img.size[0], img.size[1]
+    factor = float(target_size) / min(width, height)
+    new_size = (int(round(width * factor)), int(round(height * factor)))
+    return img.resize(new_size, Image.LANCZOS)
+
+
+def crop_image(img, target_size, center):
+    """Cut a ``target_size`` x ``target_size`` square out of ``img``.
+
+    Args:
+        img: image object exposing ``size`` as (width, height) and ``crop``.
+        target_size: side length of the square crop, in pixels.
+        center: if true, take the centered square; otherwise pick the
+            top-left corner uniformly at random.
+
+    Returns:
+        The cropped image.
+    """
+    width, height = img.size
+    size = target_size
+    if center:
+        # Floor division keeps the offsets integral on every Python version.
+        w_start = (width - size) // 2
+        h_start = (height - size) // 2
+    else:
+        w_start = random.randint(0, width - size)
+        h_start = random.randint(0, height - size)
+    box = (w_start, h_start, w_start + size, h_start + size)
+    return img.crop(box)
+
+
+def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
+    """Take a random crop of ``img`` and resize it to ``size`` x ``size``.
+
+    An aspect ratio is sampled from ``ratio`` and an area fraction from
+    ``scale``; the fraction is capped so that the crop fits in the image.
+
+    Args:
+        img: image object exposing ``size``, ``crop`` and ``resize``.
+        size: side length of the returned square image.
+        scale: [min, max] fraction of the image area to keep.
+        ratio: [min, max] range the squared aspect factor is drawn from.
+
+    Returns:
+        The cropped and resized image.
+    """
+    img_w, img_h = img.size[0], img.size[1]
+    aspect_ratio = math.sqrt(random.uniform(*ratio))
+    w_factor = 1. * aspect_ratio
+    h_factor = 1. / aspect_ratio
+
+    # Largest area fraction for which the crop still fits inside the image.
+    bound = min((float(img_w) / img_h) / (w_factor**2),
+                (float(img_h) / img_w) / (h_factor**2))
+    scale_max = min(scale[1], bound)
+    scale_min = min(scale[0], bound)
+
+    target_area = img_w * img_h * random.uniform(scale_min, scale_max)
+    target_size = math.sqrt(target_area)
+    crop_w = int(target_size * w_factor)
+    crop_h = int(target_size * h_factor)
+
+    left = random.randint(0, img_w - crop_w)
+    top = random.randint(0, img_h - crop_h)
+
+    cropped = img.crop((left, top, left + crop_w, top + crop_h))
+    return cropped.resize((size, size), Image.LANCZOS)
+
+
+def rotate_image(img):
+    """Rotate ``img`` by a random whole number of degrees in [-10, 10]."""
+    return img.rotate(random.randint(-10, 10))
+
+
+def distort_color(img):
+    """Apply brightness, contrast and color jitter in a random order.
+
+    Each enhancement factor is drawn uniformly from [0.5, 1.5].
+
+    Args:
+        img: image accepted by the ``ImageEnhance`` enhancers.
+
+    Returns:
+        The image after all three enhancements.
+    """
+
+    def jitter(enhancer_cls, lower=0.5, upper=1.5):
+        # Build a transform that enhances by a freshly sampled factor.
+        def apply(image):
+            factor = random.uniform(lower, upper)
+            return enhancer_cls(image).enhance(factor)
+
+        return apply
+
+    transforms = [
+        jitter(ImageEnhance.Brightness),
+        jitter(ImageEnhance.Contrast),
+        jitter(ImageEnhance.Color),
+    ]
+    random.shuffle(transforms)
+
+    for transform in transforms:
+        img = transform(img)
+
+    return img
+
+
+def process_image(sample, mode, color_jitter, rotate):
+    """Load one image from disk and turn it into a normalized array.
+
+    In 'train' mode the image is optionally rotated, randomly cropped to
+    DATA_DIM, optionally color-jittered and flipped left-right half of the
+    time. In every other mode it is resized to a short side of 256 and
+    center-cropped to DATA_DIM. The result is converted to RGB, to a float32
+    channel-first array scaled by 1/255, then normalized with ``img_mean``
+    and ``img_std``.
+
+    Args:
+        sample: sequence whose first item is the image path and, for
+            'train'/'val', whose second item is the label.
+        mode: 'train', 'val' or 'test'.
+        color_jitter: whether to apply ``distort_color`` in train mode.
+        rotate: whether to apply ``rotate_image`` in train mode.
+
+    Returns:
+        ``(img, label)`` for 'train' and 'val', ``[img]`` for 'test', and
+        None for any other mode.
+    """
+    img = Image.open(sample[0])
+
+    if mode == 'train':
+        if rotate:
+            img = rotate_image(img)
+        img = random_crop(img, DATA_DIM)
+        if color_jitter:
+            img = distort_color(img)
+        if random.randint(0, 1) == 1:
+            img = img.transpose(Image.FLIP_LEFT_RIGHT)
+    else:
+        img = resize_short(img, target_size=256)
+        img = crop_image(img, target_size=DATA_DIM, center=True)
+
+    if img.mode != 'RGB':
+        img = img.convert('RGB')
+
+    # HWC uint8 -> CHW float32 in [0, 1], then per-channel normalization.
+    img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
+    img -= img_mean
+    img /= img_std
+
+    if mode in ('train', 'val'):
+        return img, sample[1]
+    if mode == 'test':
+        return [img]
+
+
+class XmapEndSignal():
+    """Marker object put on the xmap queues to signal the end of data."""
+
+
+def xmap_readers(mapper,
+                 reader,
+                 process_num,
+                 buffer_size,
+                 order=False,
+                 print_queue_state=True):
+    """Build a reader that maps samples with a pool of worker threads.
+
+    One thread pulls samples from ``reader()`` into a bounded input queue,
+    ``process_num`` threads apply ``mapper`` to them, and the returned
+    generator yields the mapped samples taken from a bounded output queue.
+
+    Args:
+        mapper: callable applied to every sample.
+        reader: callable returning an iterable of samples.
+        process_num: number of mapper threads to start.
+        buffer_size: maximum size of the input and of the output queue.
+        order: if True, mapped samples are emitted in the order ``reader``
+            produced them; otherwise in completion order.
+        print_queue_state: if True, print both queue sizes while consuming,
+            at most once every three seconds.
+
+    Returns:
+        A no-argument generator function yielding the mapped samples.
+    """
+    end = XmapEndSignal()
+
+    # Feed samples from the reader into in_queue, then post the end signal.
+    def read_worker(reader, in_queue):
+        for i in reader():
+            in_queue.put(i)
+        in_queue.put(end)
+
+    # Same as read_worker, but tag every sample with its sequence number.
+    # The signature must match the args tuple the reader thread is started
+    # with in xreader(), otherwise the thread dies with a TypeError.
+    def order_read_worker(reader, in_queue):
+        in_order = 0
+        for i in reader():
+            in_queue.put((in_order, i))
+            in_order += 1
+        in_queue.put(end)
+
+    # Map samples from in_queue and put the results into out_queue.
+    def handle_worker(in_queue, out_queue, mapper):
+        sample = in_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            r = mapper(sample)
+            out_queue.put(r)
+            sample = in_queue.get()
+        # Re-post the end signal so the sibling workers see it as well.
+        in_queue.put(end)
+        out_queue.put(end)
+
+    # Map samples from in_queue and put the results into out_queue in the
+    # order given by their sequence numbers.
+    def order_handle_worker(in_queue, out_queue, mapper, out_order):
+        ins = in_queue.get()
+        while not isinstance(ins, XmapEndSignal):
+            idx, sample = ins
+            r = mapper(sample)
+            # Busy-wait until it is this sample's turn to be emitted.
+            while idx != out_order[0]:
+                pass
+            out_queue.put(r)
+            out_order[0] += 1
+            ins = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    def xreader():
+        in_queue = Queue(buffer_size)
+        out_queue = Queue(buffer_size)
+        # One-element list so the worker threads can share and mutate it.
+        out_order = [0]
+        # start a read worker in a thread
+        target = order_read_worker if order else read_worker
+        t = Thread(target=target, args=(reader, in_queue))
+        t.daemon = True
+        t.start()
+        # start several handle_workers
+        target = order_handle_worker if order else handle_worker
+        args = (in_queue, out_queue, mapper, out_order) if order else (
+            in_queue, out_queue, mapper)
+        workers = []
+        for i in xrange(process_num):
+            worker = Thread(target=target, args=args)
+            worker.daemon = True
+            workers.append(worker)
+        for w in workers:
+            w.start()
+
+        sample = out_queue.get()
+        start_t = time.time()
+        while not isinstance(sample, XmapEndSignal):
+            yield sample
+            sample = out_queue.get()
+            if time.time() - start_t > 3:
+                if print_queue_state:
+                    print("queue sizes: ", in_queue.qsize(), out_queue.qsize())
+                start_t = time.time()
+        # The first end signal was consumed above; keep draining until every
+        # worker has posted its own, yielding any samples still in flight.
+        finish = 1
+        while finish < process_num:
+            sample = out_queue.get()
+            if isinstance(sample, XmapEndSignal):
+                finish += 1
+            else:
+                yield sample
+
+    return xreader
+
+
+def _reader_creator(file_list,
+                    mode,
+                    shuffle=False,
+                    color_jitter=False,
+                    rotate=False,
+                    xmap=True):
+    """Create a multi-threaded reader over an image list file.
+
+    Each line of ``file_list`` is "<relative path> <label>" for 'train' and
+    'val', or just a relative path for 'test'. In 'train' mode the list is
+    split into equally sized, contiguous shards and only the shard of the
+    current trainer is read; the shard is selected through the
+    PADDLE_TRAINER_ID and PADDLE_TRAINERS environment variables, which
+    default to a single trainer with id 0 when unset.
+
+    Args:
+        file_list: path of the list file.
+        mode: 'train', 'val' or 'test'.
+        shuffle: whether to shuffle the lines before sharding.
+        color_jitter: forwarded to ``process_image``.
+        rotate: forwarded to ``process_image``.
+        xmap: accepted for interface compatibility; not consulted here.
+
+    Returns:
+        The reader produced by ``paddle.reader.xmap_readers`` with
+        ``process_image`` as mapper, THREAD workers and BUF_SIZE buffering.
+    """
+
+    def reader():
+        # Read everything up front so the file is closed before yielding.
+        with open(file_list) as flist:
+            full_lines = [line.strip() for line in flist]
+        if shuffle:
+            random.shuffle(full_lines)
+        if mode == 'train':
+            trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+            trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))
+            # Floor division: the slice bounds below must be integers.
+            per_node_lines = len(full_lines) // trainer_count
+            begin = trainer_id * per_node_lines
+            lines = full_lines[begin:begin + per_node_lines]
+            print(
+                "read images from %d, length: %d, lines length: %d, total: %d"
+                % (begin, per_node_lines, len(lines), len(full_lines)))
+        else:
+            lines = full_lines
+
+        for line in lines:
+            if mode in ('train', 'val'):
+                # Images live in a sub-directory named after the mode.
+                img_path, label = line.split()
+                img_path = img_path.replace("JPEG", "jpeg")
+                img_path = os.path.join(DATA_DIR, mode, img_path)
+                yield (img_path, int(label))
+            elif mode == 'test':
+                img_path = os.path.join(DATA_DIR, line)
+                yield [img_path]
+
+    mapper = functools.partial(
+        process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
+
+    return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
+
+
+def load_raw_image_uint8(sample):
+    """Load the image at ``sample[0]`` without any preprocessing.
+
+    Returns:
+        A tuple of the pixel array cast to int64 and ``sample[1]`` as int.
+    """
+    pixels = np.array(Image.open(sample[0]))
+    return pixels.astype('int64'), int(sample[1])
+
+
+def train_raw(file_list=TRAIN_LIST, shuffle=True):
+    """Create a multi-threaded reader of raw (unprocessed) training images.
+
+    The list file is split into equally sized, contiguous shards and only
+    the shard of the current trainer is read. The shard is selected through
+    the PADDLE_TRAINER_ID and PADDLE_TRAINERS environment variables, which
+    default to a single trainer with id 0 when unset.
+
+    Args:
+        file_list: path of a list file with "<relative path> <label>" lines.
+        shuffle: whether to shuffle the lines before sharding.
+
+    Returns:
+        The reader produced by ``paddle.reader.xmap_readers`` with
+        ``load_raw_image_uint8`` as mapper.
+    """
+
+    def reader():
+        # Read everything up front so the file is closed before yielding.
+        with open(file_list) as flist:
+            full_lines = [line.strip() for line in flist]
+        if shuffle:
+            random.shuffle(full_lines)
+
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))
+        # Floor division: the slice bounds below must be integers.
+        per_node_lines = len(full_lines) // trainer_count
+        begin = trainer_id * per_node_lines
+        lines = full_lines[begin:begin + per_node_lines]
+        print("read images from %d, length: %d, lines length: %d, total: %d"
+              % (begin, per_node_lines, len(lines), len(full_lines)))
+
+        for line in lines:
+            img_path, label = line.split()
+            img_path = img_path.replace("JPEG", "jpeg")
+            img_path = os.path.join(DATA_DIR, "train", img_path)
+            yield (img_path, int(label))
+
+    return paddle.reader.xmap_readers(load_raw_image_uint8, reader, THREAD,
+                                      BUF_SIZE)
+
+
+def train(file_list=TRAIN_LIST, xmap=True):
+    """Return the shuffled training reader, without color jitter or rotation."""
+    options = dict(shuffle=True, color_jitter=False, rotate=False, xmap=xmap)
+    return _reader_creator(file_list, 'train', **options)
+
+
+def val(file_list=TEST_LIST, xmap=True):
+    """Return the unshuffled validation reader for ``file_list``."""
+    options = dict(shuffle=False, xmap=xmap)
+    return _reader_creator(file_list, 'val', **options)
+
+
+def test(file_list=TEST_LIST):
+    """Return the unshuffled, label-free test reader for ``file_list``."""
+    return _reader_creator(file_list, mode='test', shuffle=False)
+
+
+if __name__ == "__main__":
+    # Rough throughput check: pull up to 10000 preprocessed training samples
+    # and print the rate together with the elapsed seconds.
+    begin = time.time()
+    for seen, _ in enumerate(train()(), 1):
+        if seen >= 10000:
+            break
+    elapsed = time.time() - begin
+    print("read 10000 speed: ", 10000 / elapsed, elapsed)
--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
@@ -163,6 +163,19 @@ def gen_job():
        volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
        volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})

+    # add ceph volumes
+    volumes.append({
+        "name": "ceph-data",
+        "cephfs": {
+            "monitors": ["192.168.16.23:6789"],
+            "secretRef": {
+                "name": "ceph-secret"
+            },
+            "user": "admin",
+        }
+    })
+    volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
+
    tn["spec"]["template"]["spec"]["volumes"] = volumes
    tn_container["volumeMounts"] = volumeMounts


--- a/benchmark/fluid/models/__init__.py
+++ b/benchmark/fluid/models/__init__.py
@@ -13,5 +13,8 @@
 # limitations under the License.

+# Model modules shipped in this package. Keep in sync with BENCHMARK_MODELS in
+# benchmark/fluid/args.py.
 __all__ = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "se_resnext", "vgg", "mnist",
+    "stacked_dynamic_lstm", "resnet_with_preprocess"
 ]
--- a/benchmark/fluid/models/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """seq2seq model for fluid."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -181,7 +182,7 @@ def lodtensor_to_ndarray(lod_tensor):
    return ndarray


-def get_model(args):
+def get_model(args, is_train, main_prog, startup_prog):
    if args.use_reader_op:
        raise Exception("machine_translation do not support reader op for now.")
    embedding_dim = 512
@@ -190,30 +191,34 @@ def get_model(args):
    dict_size = 30000
    beam_size = 3
    max_length = 250
-    avg_cost, feeding_list = seq_to_seq_net(
-        embedding_dim,
-        encoder_size,
-        decoder_size,
-        dict_size,
-        dict_size,
-        False,
-        beam_size=beam_size,
-        max_length=max_length)
-
-    # clone from default main program
-    inference_program = fluid.default_main_program().clone()
-
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-
-    train_batch_generator = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=args.batch_size * args.gpus)

-    test_batch_generator = paddle.batch(
+    # Stays None for evaluation programs so the return below never touches an
+    # unbound name when is_train is false.
+    optimizer = None
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            avg_cost, feeding_list = seq_to_seq_net(
+                embedding_dim,
+                encoder_size,
+                decoder_size,
+                dict_size,
+                dict_size,
+                False,
+                beam_size=beam_size,
+                max_length=max_length)
+            if is_train:
+                # Create the optimizer ops under the same guards as the
+                # network, matching the other benchmark models.
+                optimizer = fluid.optimizer.Adam(
+                    learning_rate=args.learning_rate)
+                optimizer.minimize(avg_cost)
+
+    # One reader per program: training data when is_train, test data otherwise.
+    batch_generator = paddle.batch(
        paddle.reader.shuffle(
-            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
-        batch_size=args.batch_size)
+            paddle.dataset.wmt14.train(dict_size)
+            if is_train else paddle.dataset.wmt14.test(dict_size),
+            buf_size=1000),
+        batch_size=args.batch_size * args.gpus)

-    return avg_cost, inference_program, optimizer, train_batch_generator, \
-           test_batch_generator, None
+    return avg_cost, optimizer, [], batch_generator, None
--- a/benchmark/fluid/models/mnist.py
+++ b/benchmark/fluid/models/mnist.py
@@ -65,61 +65,72 @@ def cnn_model(data):
    return predict


-def get_model(args):
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1, 1, 28, 28], (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        images, label = fluid.layers.read_file(data_file)
-    else:
-        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = cnn_model(pd.read_input(images))
-            label = pd.read_input(label)
+def get_model(args, is_train, main_prog, startup_prog):
+    """Build the MNIST CNN benchmark program inside `main_prog`.
+
+    Args:
+        args: parsed benchmark arguments; reads `use_reader_op`, `data_path`,
+            `batch_size`, `gpus` and `memory_optimize`.
+        is_train: when true, an Adam optimizer is created and applied to the
+            loss; otherwise no optimizer is built.
+        main_prog: program that receives the network ops.
+        startup_prog: program that receives the initialization ops.
+
+    Returns:
+        A tuple `(avg_cost, opt, [batch_acc], batched_reader,
+        data_file_handle)`. `opt` is None when `is_train` is false and
+        `data_file_handle` is None when `args.use_reader_op` is false.
+    """
+    # Defaults so the return statement is valid on every path.
+    opt = None
+    data_file_handle = None
+    with fluid.program_guard(main_prog, startup_prog):
+        if args.use_reader_op:
+            # NOTE: mnist is small, we don't implement data sharding yet.
+            # Only list data_path when the reader op actually consumes it.
+            filelist = [
+                os.path.join(args.data_path, f)
+                for f in os.listdir(args.data_path)
+            ]
+            data_file_handle = fluid.layers.open_files(
+                filenames=filelist,
+                shapes=[[-1, 1, 28, 28], (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"],
+                thread_num=1,
+                pass_num=1)
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(
+                    data_file_handle, batch_size=args.batch_size))
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                # Bind to `images` (not `input`) so cnn_model below sees it.
+                images, label = fluid.layers.read_file(data_file)
+            else:
+                images = fluid.layers.data(
+                    name='pixel', shape=[1, 28, 28], dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            predict = cnn_model(images)
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(x=cost)
+            # Evaluator
            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)
-
-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
-    else:
-        # Train program
-        predict = cnn_model(images)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        # Evaluator
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-
-    # Optimization
-    opt = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.001, beta1=0.9, beta2=0.999)
+            # Optimization
+            if is_train:
+                opt = fluid.optimizer.AdamOptimizer(
+                    learning_rate=0.001, beta1=0.9, beta2=0.999)
+                # minimize() needs the loss variable to differentiate.
+                opt.minimize(avg_cost)
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)

    # Reader
-    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=args.batch_size)
-    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
+    if is_train:
+        reader = paddle.dataset.mnist.train()
+    else:
+        reader = paddle.dataset.mnist.test()
+    batched_reader = paddle.batch(
+        reader, batch_size=args.batch_size * args.gpus)
+    return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -27,10 +27,17 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
-from recordio_converter import imagenet_train, imagenet_test
+# from recordio_converter import imagenet_train, imagenet_test
+from imagenet_reader import train, val


-def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  act='relu',
+                  is_train=True):
    conv1 = fluid.layers.conv2d(
        input=input,
        filter_size=filter_size,
@@ -39,29 +46,31 @@ def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
        padding=padding,
        act=None,
        bias_attr=False)
-    return fluid.layers.batch_norm(input=conv1, act=act)
+    return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)


-def shortcut(input, ch_out, stride):
+def shortcut(input, ch_out, stride, is_train=True):
    ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
    if ch_in != ch_out:
-        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        return conv_bn_layer(
+            input, ch_out, 1, stride, 0, None, is_train=is_train)
    else:
        return input


-def basicblock(input, ch_out, stride):
-    short = shortcut(input, ch_out, stride)
-    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+def basicblock(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')


-def bottleneck(input, ch_out, stride):
-    short = shortcut(input, ch_out * 4, stride)
-    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
-    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+def bottleneck(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out * 4, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
+    conv3 = conv_bn_layer(
+        conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')


@@ -72,7 +81,11 @@ def layer_warp(block_func, input, ch_out, count, stride):
    return res_out


-def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+def resnet_imagenet(input,
+                    class_dim,
+                    depth=50,
+                    data_format='NCHW',
+                    is_train=True):

    cfg = {
        18: ([2, 2, 2, 1], basicblock),
@@ -115,8 +128,9 @@ def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
    return out


-def get_model(args):
+def _model_reader_dshape_classdim(args, is_train):
    model = resnet_cifar10
+    reader = None
    if args.data_set == "cifar10":
        class_dim = 10
        if args.data_format == 'NCHW':
@@ -124,8 +138,10 @@ def get_model(args):
        else:
            dshape = [32, 32, 3]
        model = resnet_cifar10
-        train_reader = paddle.dataset.cifar.train10()
-        test_reader = paddle.dataset.cifar.test10()
+        if is_train:
+            reader = paddle.dataset.cifar.train10()
+        else:
+            reader = paddle.dataset.cifar.test10()
    elif args.data_set == "flowers":
        class_dim = 102
        if args.data_format == 'NCHW':
@@ -133,8 +149,10 @@ def get_model(args):
        else:
            dshape = [224, 224, 3]
        model = resnet_imagenet
-        train_reader = paddle.dataset.flowers.train()
-        test_reader = paddle.dataset.flowers.test()
+        if is_train:
+            reader = paddle.dataset.flowers.train()
+        else:
+            reader = paddle.dataset.flowers.test()
    elif args.data_set == "imagenet":
        class_dim = 1000
        if args.data_format == 'NCHW':
@@ -145,64 +163,106 @@ def get_model(args):
        if not args.data_path:
            raise Exception(
                "Must specify --data_path when training with imagenet")
-        train_reader = imagenet_train(args.data_path)
-        test_reader = imagenet_test(args.data_path)
-
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1] + dshape, (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        input, label = fluid.layers.read_file(data_file)
-    else:
-        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = model(pd.read_input(input), class_dim)
-            label = pd.read_input(label)
+        if not args.use_reader_op:
+            if is_train:
+                reader = train()
+            else:
+                reader = val()
+        else:
+            if is_train:
+                reader = train(xmap=False)
+            else:
+                reader = val(xmap=False)
+    return model, reader, dshape, class_dim
+
+
+def get_model(args, is_train, main_prog, startup_prog):
+    """Build the ResNet benchmark program inside `main_prog`.
+
+    Args:
+        args: parsed benchmark arguments; reads `use_reader_op`, `batch_size`,
+            `gpus`, `learning_rate`, `memory_optimize` and `no_random`.
+        is_train: when true, a Momentum optimizer is created and applied to
+            the loss; it is also forwarded to the model builder.
+        main_prog: program that receives the network ops.
+        startup_prog: program that receives the initialization ops.
+
+    Returns:
+        A tuple `(avg_cost, optimizer, [batch_acc1, batch_acc5],
+        batched_reader, pyreader)`. `optimizer` is None when `is_train` is
+        false; `batched_reader` is None when `args.use_reader_op` is set and
+        `pyreader` is None otherwise.
+    """
+    model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
+                                                                     is_train)
+
+    pyreader = None
+    # Fall back to a single trainer when PADDLE_TRAINERS is not exported;
+    # int(os.getenv(...)) would otherwise raise TypeError on None.
+    trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=args.batch_size * args.gpus,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('float32', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            # NOTE(review): resnet_cifar10 in this file appears to be declared
+            # without an is_train parameter, so the cifar10 data_set path
+            # would raise TypeError on this call -- confirm.
+            predict = model(input, class_dim, is_train=is_train)
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(x=cost)
-            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)

-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
+            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
+            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
+
+            # configure optimize
+            optimizer = None
+            if is_train:
+                total_images = 1281167 / trainer_count
+
+                step = int(total_images / args.batch_size + 1)
+                epochs = [30, 60, 80, 90]
+                # bd/lr describe the piecewise decay schedule that is
+                # currently disabled (see the commented-out argument below).
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=base_lr,
+                    #learning_rate=fluid.layers.piecewise_decay(
+                    #    boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+                optimizer.minimize(avg_cost)
+
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader if args.no_random else paddle.reader.shuffle(
+                reader, buf_size=5120),
+            batch_size=args.batch_size * args.gpus,
+            drop_last=True)
    else:
-        predict = model(input, class_dim)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc])
-
-    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-
-    batched_train_reader = paddle.batch(
-        train_reader if args.no_random else paddle.reader.shuffle(
-            train_reader, buf_size=5120),
-        batch_size=args.batch_size * args.gpus,
-        drop_last=True)
-    batched_test_reader = paddle.batch(
-        test_reader, batch_size=args.batch_size, drop_last=True)
-
-    return avg_cost, inference_program, optimizer, batched_train_reader,\
-                   batched_test_reader, batch_acc
+        batched_reader = None
+        pyreader.decorate_paddle_reader(
+            paddle.batch(
+                reader if args.no_random else paddle.reader.shuffle(
+                    reader, buf_size=5120),
+                batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [batch_acc1,
+                                 batch_acc5], batched_reader, pyreader
--- a/benchmark/fluid/models/resnet_with_preprocess.py
+++ b/benchmark/fluid/models/resnet_with_preprocess.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import numpy as np
+import time
+import os
+
+import cProfile, pstats, StringIO
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+# from recordio_converter import imagenet_train, imagenet_test
+from imagenet_reader import train_raw, val
+
+
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  act='relu',
+                  is_train=True):
+    """Apply a bias-free 2-D convolution followed by batch normalization.
+
+    The activation `act` is applied by the batch-norm layer, not by the
+    convolution, and batch norm runs in test mode when `is_train` is false.
+    """
+    conv_kwargs = dict(
+        input=input,
+        filter_size=filter_size,
+        num_filters=ch_out,
+        stride=stride,
+        padding=padding,
+        act=None,
+        bias_attr=False)
+    conv_out = fluid.layers.conv2d(**conv_kwargs)
+    use_test_mode = not is_train
+    return fluid.layers.batch_norm(
+        input=conv_out, act=act, is_test=use_test_mode)
+
+
+def shortcut(input, ch_out, stride, is_train=True):
+    """Return the skip-connection branch for a residual block.
+
+    `input` is passed through untouched when its channel count already equals
+    `ch_out`; otherwise it is projected with a 1x1 conv-bn layer that has no
+    activation.
+    """
+    # The channel count is read from axis 1, i.e. an NCHW layout is assumed
+    # (the NHWC alternative would be input.shape[-1]).
+    if input.shape[1] == ch_out:
+        return input
+    return conv_bn_layer(input, ch_out, 1, stride, 0, None, is_train=is_train)
+
+
+def basicblock(input, ch_out, stride, is_train=True):
+    """Residual block of two 3x3 conv-bn layers added to a shortcut, then ReLU."""
+    identity = shortcut(input, ch_out, stride, is_train=is_train)
+    first = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
+    # The second layer has no activation; ReLU is applied after the addition.
+    second = conv_bn_layer(first, ch_out, 3, 1, 1, act=None, is_train=is_train)
+    return fluid.layers.elementwise_add(x=identity, y=second, act='relu')
+
+
+def bottleneck(input, ch_out, stride, is_train=True):
+    """Residual block of 1x1 -> 3x3 -> 1x1 conv-bn layers with 4x expansion."""
+    expanded = ch_out * 4
+    identity = shortcut(input, expanded, stride, is_train=is_train)
+    reduce_out = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
+    mid_out = conv_bn_layer(reduce_out, ch_out, 3, 1, 1, is_train=is_train)
+    # The expanding layer has no activation; ReLU follows the addition.
+    expand_out = conv_bn_layer(
+        mid_out, expanded, 1, 1, 0, act=None, is_train=is_train)
+    return fluid.layers.elementwise_add(x=identity, y=expand_out, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride, is_train=True):
+    """Stack `count` residual blocks built by `block_func`.
+
+    The first block uses `stride`; the remaining `count - 1` blocks use
+    stride 1. `is_train` is forwarded to every block so that their batch-norm
+    layers follow the requested mode.
+    """
+    res_out = block_func(input, ch_out, stride, is_train=is_train)
+    for _ in range(1, count):
+        res_out = block_func(res_out, ch_out, 1, is_train=is_train)
+    return res_out
+
+
+def resnet_imagenet(input,
+                    class_dim,
+                    depth=50,
+                    data_format='NCHW',
+                    is_train=True):
+    """Build an ImageNet-style ResNet and return its softmax output.
+
+    Args:
+        input: image variable fed to the first convolution.
+        class_dim: size of the final softmax layer.
+        depth: one of 18, 34, 50, 101 or 152; any other value raises KeyError.
+        data_format: accepted for signature compatibility; not read here.
+        is_train: forwarded to every conv-bn layer; batch norm runs in test
+            mode when false.
+    """
+    # NOTE(review): depth 18 uses [2, 2, 2, 1] stages; the usual ResNet-18
+    # layout is [2, 2, 2, 2] -- confirm whether this is intentional.
+    cfg = {
+        18: ([2, 2, 2, 1], basicblock),
+        34: ([3, 4, 6, 3], basicblock),
+        50: ([3, 4, 6, 3], bottleneck),
+        101: ([3, 4, 23, 3], bottleneck),
+        152: ([3, 8, 36, 3], bottleneck)
+    }
+    stages, block_func = cfg[depth]
+    conv1 = conv_bn_layer(
+        input,
+        ch_out=64,
+        filter_size=7,
+        stride=2,
+        padding=3,
+        is_train=is_train)
+    pool1 = fluid.layers.pool2d(
+        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+    res1 = layer_warp(block_func, pool1, 64, stages[0], 1, is_train=is_train)
+    res2 = layer_warp(block_func, res1, 128, stages[1], 2, is_train=is_train)
+    res3 = layer_warp(block_func, res2, 256, stages[2], 2, is_train=is_train)
+    res4 = layer_warp(block_func, res3, 512, stages[3], 2, is_train=is_train)
+    pool2 = fluid.layers.pool2d(
+        input=res4,
+        pool_size=7,
+        pool_type='avg',
+        pool_stride=1,
+        global_pooling=True)
+    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
+    return out
+
+
+def resnet_cifar10(input,
+                   class_dim,
+                   depth=32,
+                   data_format='NCHW',
+                   is_train=True):
+    """Build a CIFAR-10-style ResNet and return its softmax output.
+
+    `depth` must satisfy `(depth - 2) % 6 == 0`; each of the three stages then
+    holds `(depth - 2) // 6` basic blocks. `data_format` is accepted but not
+    read here. `is_train` is forwarded to every conv-bn layer and is accepted
+    so that callers can pass it to either model builder uniformly.
+    """
+    assert (depth - 2) % 6 == 0
+
+    n = (depth - 2) // 6
+
+    conv1 = conv_bn_layer(
+        input=input,
+        ch_out=16,
+        filter_size=3,
+        stride=1,
+        padding=1,
+        is_train=is_train)
+    res1 = layer_warp(basicblock, conv1, 16, n, 1, is_train=is_train)
+    res2 = layer_warp(basicblock, res1, 32, n, 2, is_train=is_train)
+    res3 = layer_warp(basicblock, res2, 64, n, 2, is_train=is_train)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
+    return out
+
+
+def _model_reader_dshape_classdim(args, is_train):
+    """Select the model builder, sample reader, input shape and class count.
+
+    Args:
+        args: parsed benchmark arguments; reads `data_set`, `data_format`,
+            `data_path` and `use_reader_op`.
+        is_train: selects the training reader when true, the test/validation
+            reader otherwise.
+
+    Returns:
+        A tuple `(model, reader, dshape, class_dim)`.
+
+    Raises:
+        ValueError: if `args.data_set` is not "cifar10", "flowers" or
+            "imagenet".
+        Exception: if `args.data_set` is "imagenet" and `args.data_path` is
+            empty.
+    """
+    if args.data_set == "cifar10":
+        class_dim = 10
+        side = 32
+        model = resnet_cifar10
+        if is_train:
+            reader = paddle.dataset.cifar.train10()
+        else:
+            reader = paddle.dataset.cifar.test10()
+    elif args.data_set == "flowers":
+        class_dim = 102
+        side = 224
+        model = resnet_imagenet
+        if is_train:
+            reader = paddle.dataset.flowers.train()
+        else:
+            reader = paddle.dataset.flowers.test()
+    elif args.data_set == "imagenet":
+        class_dim = 1000
+        side = 224
+        model = resnet_imagenet
+        if not args.data_path:
+            raise Exception(
+                "Must specify --data_path when training with imagenet")
+        if is_train:
+            # The raw training reader is used with or without the reader op.
+            reader = train_raw()
+        elif args.use_reader_op:
+            reader = val(xmap=False)
+        else:
+            reader = val()
+    else:
+        # Fail with a clear message instead of an UnboundLocalError below.
+        raise ValueError("unsupported data_set: %s" % args.data_set)
+
+    # Channel-first for NCHW, channel-last for anything else.
+    if args.data_format == 'NCHW':
+        dshape = [3, side, side]
+    else:
+        dshape = [side, side, 3]
+    return model, reader, dshape, class_dim
+
+
+def get_model(args, is_train, main_prog, startup_prog):
+    """Build the ResNet benchmark program with in-graph image preprocessing.
+
+    The uint8 input is random-cropped, cast to float32, transposed, scaled by
+    1/255 and normalized with per-channel mean/std before reaching the model.
+
+    Args:
+        args: parsed benchmark arguments; reads `use_reader_op`, `batch_size`,
+            `gpus`, `learning_rate`, `memory_optimize` and `no_random`.
+        is_train: when true, a Momentum optimizer is created and applied to
+            the loss; it is also forwarded to the model builder.
+        main_prog: program that receives the network ops.
+        startup_prog: program that receives the initialization ops.
+
+    Returns:
+        A tuple `(avg_cost, optimizer, [batch_acc1, batch_acc5],
+        batched_reader, pyreader)`. `optimizer` is None when `is_train` is
+        false; `batched_reader` is None when `args.use_reader_op` is set and
+        `pyreader` is None otherwise.
+    """
+    model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
+                                                                     is_train)
+
+    pyreader = None
+    # Fall back to a single trainer when PADDLE_TRAINERS is not exported;
+    # int(os.getenv(...)) would otherwise raise TypeError on None.
+    trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=args.batch_size * args.gpus,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('uint8', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='uint8')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            # add imagenet preprocessors
+            # NOTE(review): the crop shape is `dshape` while the transpose
+            # below treats the data as HWC -- confirm that dshape/data_format
+            # agree with the layout produced by the reader.
+            random_crop = fluid.layers.random_crop(input, dshape)
+            casted = fluid.layers.cast(random_crop, 'float32')
+            # input is HWC
+            trans = fluid.layers.transpose(casted, [0, 3, 1, 2]) / 255.0
+            img_mean = fluid.layers.tensor.assign(
+                np.array([0.485, 0.456, 0.406]).astype('float32').reshape((3, 1,
+                                                                           1)))
+            img_std = fluid.layers.tensor.assign(
+                np.array([0.229, 0.224, 0.225]).astype('float32').reshape((3, 1,
+                                                                           1)))
+            h1 = fluid.layers.elementwise_sub(trans, img_mean, axis=1)
+            h2 = fluid.layers.elementwise_div(h1, img_std, axis=1)
+
+            # pre_out = (trans - img_mean) / img_std
+
+            predict = model(h2, class_dim, is_train=is_train)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+
+            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
+            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
+
+            # configure optimize
+            optimizer = None
+            if is_train:
+                total_images = 1281167 / trainer_count
+
+                step = int(total_images / args.batch_size + 1)
+                epochs = [30, 60, 80, 90]
+                # bd/lr describe the piecewise decay schedule that is
+                # currently disabled (see the commented-out argument below).
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=base_lr,
+                    #learning_rate=fluid.layers.piecewise_decay(
+                    #    boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+                optimizer.minimize(avg_cost)
+
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader if args.no_random else paddle.reader.shuffle(
+                reader, buf_size=5120),
+            batch_size=args.batch_size * args.gpus,
+            drop_last=True)
+    else:
+        batched_reader = None
+        pyreader.decorate_paddle_reader(
+            paddle.batch(
+                # reader if args.no_random else paddle.reader.shuffle(
+                #     reader, buf_size=5120),
+                reader,
+                batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [batch_acc1,
+                                 batch_acc5], batched_reader, pyreader
--- a/benchmark/fluid/models/se_resnext.py
+++ b/benchmark/fluid/models/se_resnext.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import math
+import os
+from imagenet_reader import train, val
+
+# Names exported by `from se_resnext import *`.
+__all__ = [
+    "SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d",
+    "SE_ResNeXt152_32x4d", "get_model"
+]
+
+# Hyper-parameters stored on every SE_ResNeXt instance as `self.params`.
+# NOTE(review): "steps" holds four decreasing values for the three epoch
+# boundaries -- presumably the learning rate of each piecewise-decay segment;
+# confirm against the code that consumes "learning_strategy".
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "batch_size": 256,
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+
+
+class SE_ResNeXt():
+    def __init__(self, layers=50, is_train=True):
+        """Record the network depth and mode; params come from train_parameters."""
+        self.is_train = is_train
+        self.layers = layers
+        self.params = train_parameters
+
+    def net(self, input, class_dim=1000):
+        """Build the SE-ResNeXt classifier on top of `input`.
+
+        Args:
+            input: image variable fed to the first convolution.
+            class_dim: size of the final softmax layer.
+
+        Returns:
+            The softmax output variable of the final fc layer.
+
+        Raises:
+            AssertionError: if `self.layers` is not 50, 101 or 152.
+        """
+        layers = self.layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+        # Shared by every supported depth.
+        reduction_ratio = 16
+        num_filters = [128, 256, 512, 1024]
+        if layers == 152:
+            cardinality = 64
+            depth = [3, 8, 36, 3]
+            # The 152-layer variant uses three stacked 3x3 convolutions as stem.
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=3,
+                stride=2,
+                act='relu')
+            conv = self.conv_bn_layer(
+                input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
+            conv = self.conv_bn_layer(
+                input=conv,
+                num_filters=128,
+                filter_size=3,
+                stride=1,
+                act='relu')
+        else:
+            cardinality = 32
+            depth = [3, 4, 6, 3] if layers == 50 else [3, 4, 23, 3]
+            # The 50/101-layer variants share a single 7x7 stride-2 stem.
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+        # Every variant follows its stem with the same 3x3 max pooling.
+        conv = fluid.layers.pool2d(
+            input=conv,
+            pool_size=3,
+            pool_stride=2,
+            pool_padding=1,
+            pool_type='max')
+
+        for block, block_count in enumerate(depth):
+            for i in range(block_count):
+                conv = self.bottleneck_block(
+                    input=conv,
+                    num_filters=num_filters[block],
+                    # Downsample at the first block of every stage but the
+                    # first one.
+                    stride=2 if i == 0 and block != 0 else 1,
+                    cardinality=cardinality,
+                    reduction_ratio=reduction_ratio)
+
+        pool = fluid.layers.pool2d(
+            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+        # Follow self.is_train like the batch-norm layers do, so a program
+        # built with is_train=False does not randomly drop activations.
+        drop = fluid.layers.dropout(
+            x=pool, dropout_prob=0.5, is_test=not self.is_train)
+        stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
+        out = fluid.layers.fc(input=drop,
+                              size=class_dim,
+                              act='softmax',
+                              param_attr=fluid.param_attr.ParamAttr(
+                                  initializer=fluid.initializer.Uniform(-stdv,
+                                                                        stdv)))
+        return out
+
+    def shortcut(self, input, ch_out, stride):
+        """Return `input` unchanged, or a 1x1 conv+BN projection of it when
+        the channel count or the stride differs."""
+        if input.shape[1] == ch_out and stride == 1:
+            return input
+        return self.conv_bn_layer(
+            input=input, num_filters=ch_out, filter_size=1, stride=stride)
+
+    def bottleneck_block(self, input, num_filters, stride, cardinality,
+                         reduction_ratio):
+        """Build one bottleneck: 1x1 conv, grouped 3x3 conv, 1x1 conv (all
+        with batch norm), squeeze-excitation, then residual add + ReLU.
+        """
+        out_channels = num_filters * 2
+
+        # Main branch.
+        reduced = self.conv_bn_layer(
+            input=input, num_filters=num_filters, filter_size=1, act='relu')
+        grouped = self.conv_bn_layer(
+            input=reduced, num_filters=num_filters, filter_size=3,
+            stride=stride, groups=cardinality, act='relu')
+        expanded = self.conv_bn_layer(
+            input=grouped, num_filters=out_channels, filter_size=1, act=None)
+        gated = self.squeeze_excitation(
+            input=expanded, num_channels=out_channels,
+            reduction_ratio=reduction_ratio)
+        # Shortcut branch: identity or projection, decided by self.shortcut.
+        residual = self.shortcut(input, out_channels, stride)
+        return fluid.layers.elementwise_add(x=residual, y=gated, act='relu')
+
+    def conv_bn_layer(self,
+                      input,
+                      num_filters,
+                      filter_size,
+                      stride=1,
+                      groups=1,
+                      act=None):
+        """Bias-free conv2d followed by batch_norm, which applies `act`."""
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            # Floor division keeps padding an int under Python 3 division.
+            padding=(filter_size - 1) // 2,
+            groups=groups, act=None, bias_attr=False)
+        return fluid.layers.batch_norm(
+            input=conv, act=act, is_test=not self.is_train)
+
+    def squeeze_excitation(self, input, num_channels, reduction_ratio):
+        """Gate `input` with sigmoid weights computed from its global average
+        pool through two fc layers (relu, then sigmoid)."""
+        pool = fluid.layers.pool2d(
+            input=input, pool_size=0, pool_type='avg', global_pooling=True)
+        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+        squeeze = fluid.layers.fc(
+            input=pool,
+            # Floor division keeps the size an int under Python 3 division.
+            size=num_channels // reduction_ratio,
+            act='relu',
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.Uniform(-stdv, stdv)))
+        stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
+        excitation = fluid.layers.fc(
+            input=squeeze, size=num_channels, act='sigmoid',
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.Uniform(-stdv, stdv)))
+        return fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+
+
+def SE_ResNeXt50_32x4d():
+    """Return an SE_ResNeXt built with layers=50."""
+    return SE_ResNeXt(layers=50)
+
+
+def SE_ResNeXt101_32x4d():
+    """Return an SE_ResNeXt built with layers=101."""
+    return SE_ResNeXt(layers=101)
+
+
+def SE_ResNeXt152_32x4d():
+    """Return an SE_ResNeXt built with layers=152."""
+    return SE_ResNeXt(layers=152)
+
+
+def get_model(args, is_train, main_prog, startup_prog):
+    """Build the SE_ResNeXt(layers=50) program and its input pipeline.
+
+    Returns (avg_cost, optimizer, [acc_top1, acc_top5], batched_reader,
+    pyreader). optimizer is None unless is_train; batched_reader is None
+    when args.use_reader_op is set, and pyreader is None otherwise.
+    """
+    model = SE_ResNeXt(layers=50)
+    batched_reader = None
+    pyreader = None
+    # Default to one trainer when PADDLE_TRAINERS is unset (int(None) raises).
+    trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))
+    dshape = train_parameters["input_size"]
+
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=10,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('float32', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            out = model.net(input=input)
+            cost = fluid.layers.cross_entropy(input=out, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+            acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+            optimizer = None
+            if is_train:
+                lars_decay = 1.0 if args.use_lars else 0.0
+
+                # Floor division keeps the per-trainer image count an int
+                # under Python 3's true division as well.
+                total_images = 1281167 // trainer_count
+                step = int(total_images / args.batch_size + 1)
+                # LR schedule: boundaries at step * epochs, each value 0.1x the last.
+                epochs = [40, 80, 100]
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=fluid.layers.piecewise_decay(
+                        boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4),
+                    LARS_weight_decay=lars_decay)
+                optimizer.minimize(avg_cost)
+
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    reader = train() if is_train else val()
+
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader, batch_size=args.batch_size * args.gpus, drop_last=True)
+    else:
+        pyreader.decorate_paddle_reader(
+            paddle.batch(reader, batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [acc_top1, acc_top5], batched_reader, pyreader
--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -26,7 +26,6 @@ import numpy
 import paddle
 import paddle.dataset.imdb as imdb
 import paddle.fluid as fluid
-import paddle.batch as batch
 import paddle.fluid.profiler as profiler

 word_dict = imdb.word_dict()
@@ -43,19 +42,7 @@ def crop_sentence(reader, crop_size):
    return __impl__


-def get_model(args):
-    if args.use_reader_op:
-        raise Exception(
-            "stacked_dynamic_lstm do not support reader op for now.")
-    lstm_size = 512
-    emb_dim = 512
-    crop_size = 1500
-
-    data = fluid.layers.data(
-        name="words", shape=[1], lod_level=1, dtype='int64')
-    sentence = fluid.layers.embedding(
-        input=data, size=[len(word_dict), emb_dim])
-
+def lstm_net(sentence, lstm_size):
    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')

    rnn = fluid.layers.DynamicRNN()
@@ -97,31 +84,47 @@ def get_model(args):

    last = fluid.layers.sequence_pool(rnn(), 'last')
    logit = fluid.layers.fc(input=last, size=2, act='softmax')
-    loss = fluid.layers.cross_entropy(
-        input=logit,
-        label=fluid.layers.data(
-            name='label', shape=[1], dtype='int64'))
-    loss = fluid.layers.mean(x=loss)
+    return logit

-    # add acc
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
-                shape=[1], dtype='int64'), total=batch_size_tensor)

-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
-    adam = fluid.optimizer.Adam()
+def get_model(args, is_train, main_prog, startup_prog):
+    if args.use_reader_op:
+        raise Exception(
+            "stacked_dynamic_lstm do not support reader op for now.")
+    lstm_size = 512
+    emb_dim = 512
+    crop_size = 1500

-    train_reader = batch(
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            data = fluid.layers.data(
+                name="words", shape=[1], lod_level=1, dtype='int64')
+            sentence = fluid.layers.embedding(
+                input=data, size=[len(word_dict), emb_dim])
+            logit = lstm_net(sentence, lstm_size)
+            loss = fluid.layers.cross_entropy(
+                input=logit,
+                label=fluid.layers.data(
+                    name='label', shape=[1], dtype='int64'))
+            loss = fluid.layers.mean(x=loss)
+
+            # add acc
+            batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
+                        shape=[1], dtype='int64'), total=batch_size_tensor)
+
+            # `adam` is returned on every path, so bind it even when the
+            # program is built for evaluation (is_train is False).
+            adam = fluid.optimizer.Adam() if is_train else None
+            if is_train:
+                adam.minimize(loss)
+
+    dataset = imdb.train if is_train else imdb.test
+    reader = crop_sentence(dataset(word_dict), crop_size)
+
+    batched_reader = paddle.batch(
        paddle.reader.shuffle(
-            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
+            reader, buf_size=25000),
        batch_size=args.batch_size * args.gpus)
-    test_reader = batch(
-        paddle.reader.shuffle(
-            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
-        batch_size=args.batch_size)

-    return loss, inference_program, adam, train_reader, test_reader, batch_acc
+    return loss, adam, [batch_acc], batched_reader, None
--- a/benchmark/fluid/models/vgg.py
+++ b/benchmark/fluid/models/vgg.py
@@ -25,7 +25,7 @@ import functools
 import os


-def vgg16_bn_drop(input):
+def vgg16_bn_drop(input, is_train=True):
    def conv_block(input, num_filter, groups, dropouts):
        return fluid.nets.img_conv_group(
            input=input,
@@ -46,13 +46,13 @@ def vgg16_bn_drop(input):

    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train)
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
    return fc2


-def get_model(args):
+def get_model(args, is_train, main_prog, startup_prog):
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
@@ -65,57 +65,56 @@ def get_model(args):
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]
+    # Only some paths create these; default them so the final return is safe.
+    optimizer = None
+    data_file_handle = None
+    with fluid.program_guard(main_prog, startup_prog):
+        if args.use_reader_op:
+            filelist = [
+                os.path.join(args.data_path, f)
+                for f in os.listdir(args.data_path)
+            ]
+            data_file_handle = fluid.layers.open_files(
+                filenames=filelist, shapes=[[-1] + data_shape, (-1, 1)],
+                lod_levels=[0, 0], dtypes=["float32", "int64"],
+                thread_num=1, pass_num=1)
+            data_file = fluid.layers.double_buffer(fluid.layers.batch(
+                data_file_handle, batch_size=args.batch_size))
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                images, label = fluid.layers.read_file(data_file)
+            else:
+                images = fluid.layers.data(
+                    name='data', shape=data_shape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+            # Train program
+            net = vgg16_bn_drop(images, is_train=is_train)
+            predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)

-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1] + data_shape, (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        images, label = fluid.layers.read_file(data_file)
-    else:
-        images = fluid.layers.data(
-            name='data', shape=data_shape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    # Train program
-    net = vgg16_bn_drop(images)
-    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
-    # Optimization
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+            # Evaluator
+            batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            batch_acc = fluid.layers.accuracy(
+                input=predict, label=label, total=batch_size_tensor)
+            # Optimization
+            if is_train:
+                optimizer = fluid.optimizer.Adam(
+                    learning_rate=args.learning_rate)
+                optimizer.minimize(avg_cost)

    # data reader
-    train_reader = paddle.batch(
+    if is_train:
+        reader = paddle.dataset.cifar.train10() \
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train()
+    else:
+        reader = paddle.dataset.cifar.test10() \
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test()
+
+    batched_reader = paddle.batch(
        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
+            reader, buf_size=5120),
        batch_size=args.batch_size * args.gpus)
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10()
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-        batch_size=args.batch_size)

-    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
+    return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -16,16 +16,6 @@ set(ANAKIN_LIBRARY     ${ANAKIN_INSTALL_DIR})
 set(ANAKIN_SHARED_LIB  ${ANAKIN_LIBRARY}/libanakin.so)
 set(ANAKIN_SABER_LIB   ${ANAKIN_LIBRARY}/libanakin_saber_common.so)

-# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
-set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
-set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
-set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
-set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
-execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
-
 include_directories(${ANAKIN_INCLUDE})
 include_directories(${ANAKIN_INCLUDE}/saber/)
 include_directories(${ANAKIN_INCLUDE}/saber/core/)
@@ -48,21 +38,24 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
    -Wno-reorder
    -Wno-error=cpp)

+if(WITH_GPU)
+    set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=YES -DCUDNN_ROOT=${CUDNN_ROOT} -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR})
+else()
+    set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=NO)
+endif()
 ExternalProject_Add(
    extern_anakin
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLML_PROJECT}
    GIT_REPOSITORY      "https://github.com/PaddlePaddle/Anakin"
-    GIT_TAG             "9424277cf9ae180a14aff09560d3cd60a49c76d2"
+    GIT_TAG             "3c8554f4978628183566ab7dd6c1e7e66493c7cd"
    PREFIX              ${ANAKIN_SOURCE_DIR}
    UPDATE_COMMAND      ""
-    CMAKE_ARGS          -DUSE_GPU_PLACE=YES
+    CMAKE_ARGS          ${CMAKE_ARGS_PREFIX}
                        -DUSE_X86_PLACE=YES
                        -DBUILD_WITH_UNIT_TEST=NO
                        -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
                        -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-                        -DCUDNN_ROOT=${CUDNN_ROOT}
-                        -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
                        -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
                        ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -145,12 +145,12 @@ copy(memory_lib
 set(inference_deps paddle_fluid_shared paddle_fluid)

 set(module "inference/api")
-if (WITH_ANAKIN AND WITH_GPU)
+if (WITH_ANAKIN AND WITH_MKL)
    copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
        SRCS
        ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
        ${ANAKIN_INSTALL_DIR} # anakin release
-        DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
+        DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
     list(APPEND inference_deps anakin_inference_lib)
 endif()


--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -822,6 +822,14 @@ pad
 ..  autofunction:: paddle.fluid.layers.pad
    :noindex:

+.. _api_fluid_layers_pad_constant_like:
+
+pad_constant_like
+-----------------
+
+..  autofunction:: paddle.fluid.layers.pad_constant_like
+    :noindex:
+
 .. _api_fluid_layers_label_smooth:

 label_smooth
@@ -1145,6 +1153,14 @@ sigmoid
 ..  autofunction:: paddle.fluid.layers.sigmoid
    :noindex:

+.. _api_fluid_layers_hsigmoid:
+
+hsigmoid
+--------
+
+..  autofunction:: paddle.fluid.layers.hsigmoid
+    :noindex:
+
 .. _api_fluid_layers_logsigmoid:

 logsigmoid

--- a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
+++ b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
@@ -104,6 +104,7 @@ visualDL --logdir=scratch_log --port=8080

 # 访问 http://127.0.0.1:8080
 ```
+如果出现`TypeError: __init__() got an unexpected keyword argument 'file'`, 是因为protobuf不是3.5以上，运行`pip install --upgrade protobuf`就能解决。

 如果在虚拟环境下仍然遇到安装问题，请尝试以下方法。


--- a/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst
+++ b/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst
@@ -4,13 +4,12 @@ Paddle 预测 API
 为了更简单方便的预测部署，Fluid 提供了一套高层 API
 用来隐藏底层不同的优化实现。

-`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__
+`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`_
 包括

 -  头文件 ``paddle_inference_api.h`` 定义了所有的接口
 -  库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
-  库文件 ``libpaddle_inference_api.so`` 或
-   ``libpaddle_inference_api.a``
+

 编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。

@@ -97,8 +96,7 @@ engine
    CHECK(predictor->Run(slots, &outputs));
    // 获取 outputs ...

-编译时，联编 ``libpaddle_fluid.a/.so`` 和
-``libpaddle_inference_api.a/.so`` 便可。
+编译时，联编 ``libpaddle_fluid.a/.so`` 便可。

 详细代码参考
 ------------

--- a/doc/survey/dynamic_graph.md
+++ b/doc/survey/dynamic_graph.md
@@ -2,42 +2,47 @@

 ## Automatic Differentiation

-A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers.  Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf).
+A key challenge in deep learning is to automatically derive the backward pass given the forward pass as a program, which has been long studied in the field of [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf), or autodiff, before the prosperity of deep learning.

-## The Tape
+## Program Transformation v.s. Backtracking

-Given the forward pass program (usually in Python in practices), there are two strategies to derive the backward pass:
+Given the forward pass program, there are two strategies to derive the backward pass:

-1. from the forward pass program itself, or
-1. from the execution trace of the forward pass program, which is often known as the *tape*.
+1. by transforming the forward pass program without executing it, or
+1. by backtracking the execution process of the forward pass program.

-This article surveys systems that follow the latter strategy.
+This article is about the latter strategy. 

-## Dynamic Network
+## The Tape and Dynamic Networks

-When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration.  This is known as *dynamic network*.
+We refer to the trace of the execution of the forward pass program as a *tape* [[1]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf).  When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration. This is time-consuming, but it also makes it easy to handle forward programs that include control flows like if-else and for/while. With these control flows, the execution trace might change from iteration to iteration.  Networks whose trace changes in this way are known as *dynamic networks* in the field of deep learning.

-Deep learning systems that utilize the idea of dynamic network gained their popularities in recent years.  This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/).
+## Typical Systems

-## An Overview
+Deep learning systems that utilize the idea of dynamic networks have gained popularity in recent years.  This article surveys the following typical systems:

-Both frameworks record a ‘tape’ of the computation and interpreting (or run-time compiling) a transformation of the tape played back in reverse. This tape is a different kind of entity than the original program.[[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf)
+- [DyNet](https://dynet.readthedocs.io/en/latest/)
+- [PyTorch](https://pytorch.org/)
+- Chainer
+- Autograd from HIPS

-Consider the following code feedforward model.
+Before diving into these systems, let us pose an example forward pass program:

 ```python
 x = Variable(randn(20, 1)))
 label = Variable(randint(1))
 W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20))
 h = matmul(W_1, x)
-pred = matmul(W_2, x)
+pred = matmul(W_2, h)
 loss = softmax(pred, label)
 loss.backward()
 ```

-### 1) Dynet uses List to encode the Tape
+## The Representation of Tapes

-During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, are recorded in the tape, along with the necessary information needed to do the backward such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.
+### DyNet: the Tape as a List
+
+DyNet uses a linear data structure, a list, to represent the tape. During the execution of the above example, it is a list of operators: `matmul`, `matmul`, and `softmax`.  The list also includes information needed to do the backward pass, such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.

 <details> 
 <summary></summary>
@@ -69,9 +74,9 @@ digraph g {

 ![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20})

-### 2) Pytorch uses Node Graph to encode the Tape
+### PyTorch: the Tape as a Graph

-The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order.
+The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a `Function` records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad ops are performed in the sorted order.  Please be aware that a `Function` might have more than one `prev_func`.

 <details> 
 <summary></summary>
@@ -132,27 +137,22 @@ digraph g {

 ![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20})

-Chainer and Autograd uses the similar techniques to record the forward pass. For details please refer to the appendix.
-
-## Design choices
+Chainer and Autograd use similar techniques to record the forward pass. For details, please refer to the appendix.

-### 1) Dynet's List vs Pytorch's Node Graph
+## Comparison: List v.s. Graph

-What's good about List:
-1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and calling the corresponding backward operator.
-1. It promises effient data parallelism implementations. One could count the time of usage of a certain variable during the construction list. Then in the play back, one knows the calculation of a variable has completed. This enables communication and computation overlapping.
+The list of DyNet could be considered the result of the topological sort of the graph of PyTorch. Or, the graph is the raw representation of the tape, which gives us the chance to *prune* the part of the graph that is irrelevant to the backward pass before the topological sort [[2]](https://openreview.net/pdf?id=BJJsrmfCZ). In the following example, PyTorch only does backward on `SmallNet`, while DyNet does both `SmallNet` and `BigNet`:

-What's good about Node Graph:
-1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. [[2]](https://openreview.net/pdf?id=BJJsrmfCZ) Consider the following example, Pytorch only does backward on SmallNet while Dynet does both BigNet and SmallNet.
 ```python
 result = BigNet(data)
 loss = SmallNet(data)
 loss.backward()
 ```

-### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation
+## Lazy v.s. Immediate Evaluation
+
+Another difference between DyNet and PyTorch is that DyNet lazily evaluates the forward pass, whereas PyTorch executes it immediately. Consider the following example:

-Dynet builds the list in a symbolic matter. Consider the following example
 ```python
 for epoch in range(num_epochs):
    for in_words, out_label in training_data:
@@ -164,16 +164,17 @@ for epoch in range(num_epochs):
        loss_val = loss_sym.value()
        loss_sym.backward()
 ```
+
 The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This defered execution is useful because it allows some graph-like optimization possible, e.g. kernel fusion.

-Pytorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
+PyTorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.


-## What can fluid learn from them?
+## Fluid: Learning the Lessons

 Please refer to `paddle/contrib/dynamic/`.

-# Appendix
+## Appendix

 ### Overview


--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -43,6 +43,7 @@ paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list',
 paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
+paddle.fluid.Trainer.save_inference_model ArgSpec(args=['self', 'param_path', 'feeded_var_names', 'target_var_indexes'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None)
@@ -65,7 +66,7 @@ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'pla
 paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
 paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspilerConfig.__init__ 
-paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
+paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0, None))
 paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
@@ -312,7 +313,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
 paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
-paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
+paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
@@ -376,7 +377,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
 paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0))
+paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -326,7 +326,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
  ir::Graph &result = *graph;

  for (auto &node : nodes) {
-    if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) {
+    if (node->IsVar() && node->Var()) {
      all_vars_.emplace(node->Name(), node->Var());
    }
  }
@@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
  }
 }

-bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
-    const std::string &og,
-    std::unordered_set<std::string> *og_has_been_broadcast) const {
-  bool is_pg_once =
-      grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
-  if (is_pg_once) {
-    // Insert NCCL AllReduce Op
-    og_has_been_broadcast->insert(og);
-  }
-  return is_pg_once;
-}
-
 int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
                                           ir::Node *node) const {
  if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
@@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
  return var;
 }

-// Find the first occurence of `prev_op_name` and make current `op` depend
-// on it.
-void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op,
-                                        const std::string &prev_op_name) const {
-  for (auto &prev_op : result->Get<GraphOps>(kGraphOps)) {
-    if (prev_op->Name() == prev_op_name) {
-      auto *dep_var = new DummyVarHandle(result->CreateControlDepVar());
-      prev_op->AddOutput(dep_var);
-      result->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
-      op->AddInput(dep_var);
-    }
-  }
-}
-
 void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
                                                ir::Node *node) const {
  int op_dev_id = -1;

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
  std::vector<std::string> FindDistTrainRecvVars(
      const std::vector<ir::Node *> &nodes) const;

-  void ConnectOp(ir::Graph *result, OpHandleBase *op,
-                 const std::string &prev_op_name) const;
-
  void CreateComputationalOps(ir::Graph *result, ir::Node *node,
                              size_t num_places) const;

@@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
  void CreateComputationalOp(ir::Graph *result, ir::Node *node,
                             int dev_id) const;

-  bool IsParameterGradientOnce(
-      const std::string &og,
-      std::unordered_set<std::string> *og_has_been_broadcast) const;
-
  int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;

  void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
 set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
 file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt.  DO NOT EDIT!\n\n")
 file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
-function(pass_library TARGET)
+
+
+# Usage: pass_library(target inference) will append to paddle_inference_pass.h
+function(pass_library TARGET DEST)
    set(options "")
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass)
-    file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
-    set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
+    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass ${op_library_DEPS})
+    # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
+    if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
+        message(STATUS "add pass ${TARGET} ${DEST}")
+        file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
+        set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
+    endif()
 endfunction()

 cc_library(node SRCS node.cc DEPS proto_desc)
@@ -18,13 +25,15 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)

-pass_library(graph_to_program_pass)
-pass_library(graph_viz_pass)
-pass_library(fc_fuse_pass)
-pass_library(attention_lstm_fuse_pass)
-pass_library(infer_clean_graph_pass)
-pass_library(fc_lstm_fuse_pass)
-pass_library(seq_concat_fc_fuse_pass)
+pass_library(graph_to_program_pass base)
+pass_library(graph_viz_pass base)
+pass_library(fc_fuse_pass inference)
+pass_library(attention_lstm_fuse_pass inference)
+pass_library(infer_clean_graph_pass inference)
+pass_library(fc_lstm_fuse_pass inference)
+pass_library(fc_gru_fuse_pass inference)
+pass_library(seq_concat_fc_fuse_pass inference)
+
 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")

 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)

--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -13,13 +13,10 @@
 // limitations under the License.

 #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
-
 #include <string>
-
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/api/helper.h"

 namespace paddle {
 namespace framework {

--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h"
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Registers the "x -> FC -> GRU" pattern on `pattern`, naming every pattern
+// node under `name_scope`. `with_fc_bias` is forwarded to patterns::FC.
+static void BuildPattern(PDPattern* pattern, const std::string& name_scope,
+                         bool with_fc_bias) {
+  // Input of the FC: a non-persistable variable that is an input of a mul op.
+  PDNode* fc_input = pattern->NewNode(name_scope, "x")
+                         ->assert_is_op_input("mul")
+                         ->assert_var_not_persistable();
+
+  // fc_output is a tmp var, it will be removed after the fuse.
+  auto* fc_output =
+      patterns::FC(pattern, name_scope, fc_input, with_fc_bias);
+  fc_output->AsIntermediate();
+
+  // Chain the GRU pattern onto the FC output.
+  patterns::GRU(pattern, name_scope, fc_output);
+  VLOG(3) << "fc_gru pattern \n" << pattern->DotString();
+}
+
+// Detects every "x -> FC -> GRU" subgraph described by BuildPattern and
+// replaces it with a single fusion_gru op.
+//
+// graph:        graph that is rewritten in place; when a match is fused it
+//               must carry the kParamScopeAttr attribute.
+// name_scope:   prefix of the pattern node names and of the names of the
+//               variables created by the fusion.
+// scope:        not read by this function; the parameter scope is fetched
+//               from the graph's kParamScopeAttr attribute instead.
+// with_fc_bias: true when the FC is mul + elementwise_add, false when it is
+//               a bare mul.
+//
+// Returns the number of subgraphs that were fused.
+static int BuildFusion(Graph* graph, const std::string& name_scope,
+                       Scope* scope, bool with_fc_bias) {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+
+  BuildPattern(pattern, name_scope, with_fc_bias);
+
+  // Create New OpDesc
+  // All arguments are graph node ids; fc_bias is only looked up when
+  // with_fc_bias is true (callers pass -1 otherwise).
+  auto gru_creater = [&](int gru, int x, int weight_x, int weight_h, int bias,
+                         int hidden, int fc_bias) {
+#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
+    GET_NODE(x);
+    GET_NODE(weight_x);
+    GET_NODE(weight_h);
+    GET_NODE(bias);
+    GET_NODE(hidden);
+    GET_NODE(gru);
+
+    OpDesc op_desc;
+    op_desc.SetType("fusion_gru");
+
+#define NEW_NAME(x) name_scope + "/at." #x ".new"
+#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()});
+    SET_IN(X, x);
+    SET_IN(WeightX, weight_x);
+    SET_IN(WeightH, weight_h);
+    if (with_fc_bias) {
+      // The fused op reads the combined (fc + gru) bias variable that is
+      // created in the scope further below.
+      op_desc.SetInput("Bias", {NEW_NAME(bias) + bias_n->Name()});
+    } else {
+      SET_IN(Bias, bias);
+    }
+#undef SET_IN
+    op_desc.SetInput("H0", {});
+    op_desc.SetOutput("Hidden", {hidden_n->Name()});
+    op_desc.SetAttr("is_reverse", gru_n->Op()->GetAttr("is_reverse"));
+    // TODO(TJ): This should be an option for infer
+    op_desc.SetAttr("use_seq", true);
+
+#define SET_IMTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)})
+    SET_IMTERMEDIATE_OUT(ReorderedH0);
+    SET_IMTERMEDIATE_OUT(XX);
+    SET_IMTERMEDIATE_OUT(BatchedInput);
+    SET_IMTERMEDIATE_OUT(BatchedOut);
+#undef SET_IMTERMEDIATE_OUT
+
+    auto* op = graph->CreateOpNode(&op_desc);
+    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    // Note: this local intentionally shadows the (unused) function parameter.
+    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+    PADDLE_ENFORCE(scope);
+    if (with_fc_bias) {
+      // Fusion GRU bias = fcbias + grubias
+      auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias_n->Name());
+      // Validate the variable before it is dereferenced.
+      PADDLE_ENFORCE(fusion_bias_var);
+      auto* out_bias_tensor =
+          fusion_bias_var->GetMutable<framework::LoDTensor>();
+      GET_NODE(fc_bias);
+      PADDLE_ENFORCE(fc_bias_n);
+      auto* gru_bias_var = scope->FindVar(bias_n->Name());
+      auto* fc_bias_var = scope->FindVar(fc_bias_n->Name());
+      PADDLE_ENFORCE(gru_bias_var);
+      PADDLE_ENFORCE(fc_bias_var);
+      const auto& gru_bias_tensor = gru_bias_var->Get<framework::LoDTensor>();
+      const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
+      // The element-wise sum below indexes both biases with the same index,
+      // so they must hold the same number of elements.
+      PADDLE_ENFORCE(fc_bias_tensor.numel() == gru_bias_tensor.numel());
+      // new bias = fc bias + gru bias
+      out_bias_tensor->Resize(gru_bias_tensor.dims());
+      auto* data = out_bias_tensor->mutable_data<float>(platform::CPUPlace());
+      // Fetch the raw pointers and the size once instead of per iteration.
+      const auto* fc_bias_data = fc_bias_tensor.data<float>();
+      const auto* gru_bias_data = gru_bias_tensor.data<float>();
+      const auto bias_numel = out_bias_tensor->numel();
+      for (int i = 0; i < bias_numel; i++) {
+        data[i] = fc_bias_data[i] + gru_bias_data[i];
+      }
+    }
+#undef GET_NODE
+
+    // Create the variables backing the intermediate outputs of the fused op.
+#define NEW_IMTERMEDIATE_OUT(key) \
+  scope->Var(NEW_NAME(key))->GetMutable<framework::LoDTensor>()
+    NEW_IMTERMEDIATE_OUT(ReorderedH0);
+    NEW_IMTERMEDIATE_OUT(XX);
+    NEW_IMTERMEDIATE_OUT(BatchedInput);
+    NEW_IMTERMEDIATE_OUT(BatchedOut);
+#undef NEW_NAME
+#undef NEW_IMTERMEDIATE_OUT
+
+    IR_NODE_LINK_TO(x_n, op);
+    IR_NODE_LINK_TO(weight_x_n, op);
+    IR_NODE_LINK_TO(weight_h_n, op);
+    IR_NODE_LINK_TO(bias_n, op);  // actually should link to new bias if have
+    IR_NODE_LINK_TO(op, hidden_n);
+    // h0?
+    return op;
+  };
+
+  int fusion_count{0};
+  // Called once per matched subgraph: creates the fused op and removes the
+  // nodes it replaces.
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    // For pattern node <name__> this defines <name__>_n (the matched graph
+    // node) and <name__> (its node id).
+#define GET_NODE(name__)                                \
+  std::string name__##key = name_scope + "/" + #name__; \
+  auto* name__##n = pattern->RetrieveNode(name__##key); \
+  PADDLE_ENFORCE(name__##n);                            \
+  PADDLE_ENFORCE(subgraph.count(name__##n));            \
+  Node* name__##_n = subgraph.at(name__##n);            \
+  int name__ __attribute__((unused)) = name__##_n->id();
+
+    GET_NODE(x);
+    GET_NODE(w);  // fc weight
+    GET_NODE(mul);
+    GET_NODE(fc_out);
+    GET_NODE(Weight);
+    GET_NODE(gru);
+    GET_NODE(Bias);
+    GET_NODE(Hidden);
+    // nodes need be removed
+    GET_NODE(BatchGate);
+    GET_NODE(BatchResetHiddenPrev);
+    GET_NODE(BatchHidden);
+
+    if (with_fc_bias) {
+      GET_NODE(mul_out);
+      GET_NODE(fc_bias);
+      GET_NODE(elementwise_add);
+      gru_creater(gru, x, w, Weight, Bias, Hidden, fc_bias);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes(
+          {mul_n, gru_n, elementwise_add_n, fc_bias_n, fc_out_n, mul_out_n,
+           BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    } else {
+      gru_creater(gru, x, w, Weight, Bias, Hidden, -1);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes(
+          {mul_n, gru_n, BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    }
+#undef GET_NODE
+
+    ++fusion_count;
+  };
+
+  gpd(graph, handler);
+
+  return fusion_count;
+}
+
+// Fuses every "mul -> gru" subgraph (an FC without bias followed by a GRU)
+// of `graph` and records how many fusions were performed.
+std::unique_ptr<ir::Graph> MulGRUFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  const bool with_fc_bias = false;
+  const int fused_num =
+      BuildFusion(graph.get(), name_scope_, param_scope(), with_fc_bias);
+  AddStatis(fused_num);
+
+  return graph;
+}
+
+// Fuses every "mul -> elementwise_add -> gru" subgraph (an FC with bias
+// followed by a GRU) of `graph` and records how many fusions were performed.
+std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  const bool with_fc_bias = true;
+  const int fused_num =
+      BuildFusion(graph.get(), name_scope_, param_scope(), with_fc_bias);
+  AddStatis(fused_num);
+
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
+REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// FCGRUFusePass and MulGRUFusePass both fuse to the same fusion_gru op.
+
+class FCGRUFusePass : public FusePassBase {
+ public:
+  virtual ~FCGRUFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+  const std::string name_scope_{"fc_gru_fuse"};  // pattern name prefix
+};
+
+// MulGRUFusePass handles the FC-without-bias case (a bare mul before gru).
+class MulGRUFusePass : public FusePassBase {
+ public:
+  virtual ~MulGRUFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"fc_nobias_gru_fuse"};  // pattern name prefix
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -20,12 +20,13 @@ namespace paddle {
 namespace framework {
 namespace ir {

-std::string GenNodeName(const std::string& prefix, const std::string& name) {
+static std::string GenNodeName(const std::string& prefix,
+                               const std::string& name) {
  return prefix + "/" + name;
 }

-void BuildPattern(PDPattern* pattern, const std::string& name_scope,
-                  bool with_fc_bias) {
+static void BuildPattern(PDPattern* pattern, const std::string& name_scope,
+                         bool with_fc_bias) {
  PDNode* x = pattern->NewNode(name_scope, "x")
                  ->assert_is_op_input("mul")
                  ->assert_var_not_persistable();
@@ -35,8 +36,8 @@ void BuildPattern(PDPattern* pattern, const std::string& name_scope,
  // LOG(INFO) << "\n" << pattern->DotString();
 }

-int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
-                bool with_fc_bias) {
+static int BuildFusion(Graph* graph, const std::string& name_scope,
+                       Scope* scope, bool with_fc_bias) {
  GraphPatternDetector gpd;
  auto* pattern = gpd.mutable_pattern();

@@ -87,15 +88,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
      }
      op_desc.SetInput("Bias", {new_bias_var});
    }
-
 #undef GET_NODE

+    // Create temp variables.
+    scope->Var(name_scope + "/BatchedInput.new")
+        ->GetMutable<framework::LoDTensor>();
+    scope->Var(name_scope + "/BatchCellPreAct.new")
+        ->GetMutable<framework::LoDTensor>();
+    scope->Var(name_scope + "/BatchedGate.new")
+        ->GetMutable<framework::LoDTensor>();
+
    op_desc.SetInput("H0", {});
    op_desc.SetInput("C0", {});
    op_desc.SetOutput("Hidden", {hidden_n->Name()});
    op_desc.SetOutput("Cell", {cell_n->Name()});
    op_desc.SetOutput("XX", {xx_n->Name()});
-    op_desc.SetOutput("BatchedInput", {"blstm_0.tmp_2"});
+    op_desc.SetOutput("BatchedGate", {name_scope + "/BatchedGate.new"});
+    op_desc.SetOutput("BatchCellPreAct", {name_scope + "/BatchCellPreAct.new"});
+    op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"});
    op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
    op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
    // TODO(TJ): get from attr
@@ -131,8 +141,8 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,

  int fusion_count{0};

-  auto fc_no_bias_handler = [&](
-      const GraphPatternDetector::subgraph_t& subgraph, Graph* g) {
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
 #define GET_NODE(name__)                                \
  std::string name__##key = name_scope + "/" + #name__; \
  auto* name__##n = pattern->RetrieveNode(name__##key); \
@@ -153,21 +163,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,

    if (with_fc_bias) {
      GET_NODE(fc_bias);
+      GET_NODE(elementwise_add);
      lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes(
+          {mul_n, lstm_n, elementwise_add_n});
+      GraphSafeRemoveNodes(graph, marked_nodes);
    } else {
      lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
+      GraphSafeRemoveNodes(graph, marked_nodes);
    }
 #undef GET_NODE

-    // Remove unneeded nodes.
-    std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
-
-    GraphSafeRemoveNodes(graph, marked_nodes);
-
    ++fusion_count;
  };

-  gpd(graph, fc_no_bias_handler);
+  gpd(graph, handler);

  return fusion_count;
 }

--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#pragma once
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"

--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -73,7 +73,6 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {
 void GraphPatternDetector::operator()(Graph* graph,
                                      GraphPatternDetector::handle_t handler) {
  if (!MarkPDNodesInGraph(*graph)) {
-    LOG(INFO) << "Mark failed";
    return;
  }

@@ -86,7 +85,7 @@ void GraphPatternDetector::operator()(Graph* graph,
  LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
  int id = 0;
  for (auto& g : subgraphs) {
-    LOG(INFO) << "optimizing #" << id++ << " subgraph";
+    VLOG(3) << "optimizing #" << id++ << " subgraph";
    handler(g, graph);
  }
 }
@@ -520,76 +519,96 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) {

 PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
                     PDNode* x, bool with_bias) {
-  // Create Operators
-  PDNode* elementwise_add_op{nullptr};
+  // mul op
  auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul");
-  if (with_bias) {
-    elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
-                             ->assert_is_op("elementwise_add");
-  }
-  // Create variables
-  // w
  auto* mul_weight_var = pattern->NewNode(name_scope, "w")
                             ->AsInput()
                             ->assert_is_persistable_var()
-                             ->assert_is_op_nth_input("mul", "Y", 0);
-  PDNode* mul_out_var{nullptr};
+                             ->assert_is_op_input("mul", "Y");
+
+  PDNode* fc_out{nullptr};
  if (with_bias) {
+    PDNode* elementwise_add_op{nullptr};
+    PDNode *mul_out_var{nullptr}, *bias{nullptr};
+    elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
+                             ->assert_is_op("elementwise_add");
    // intermediate variable, will be removed in the IR after fuse.
    mul_out_var = pattern->NewNode(name_scope, "mul_out")
                      ->AsIntermediate()
                      ->assert_is_only_output_of_op("mul")
                      ->assert_is_op_input("elementwise_add");
-  }
-  PDNode *bias{nullptr}, *fc_out{nullptr};
-  if (with_bias) {
    // bias
    bias = pattern->NewNode(name_scope, "fc_bias")
-               ->assert_is_op_input("elementwise_add")
-               ->AsInput();
+               ->AsInput()
+               ->assert_is_op_input("elementwise_add");
    // output
    fc_out = pattern->NewNode(name_scope, "fc_out")
                 ->AsOutput()
                 ->assert_is_op_output("elementwise_add");
+    mul_op->LinksFrom({x, mul_weight_var}).LinksTo({mul_out_var});
+    elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
  } else {
    fc_out = pattern->NewNode(name_scope, "fc_out")
                 ->AsOutput()
                 ->assert_is_op_output("mul");
-  }
-
-  if (with_bias) {
-    mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var});
-    elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
-  } else {
    mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out});
  }
-
  return fc_out;
 }
+
+#define NEW_NODE(op__, arg__, io__)                  \
+  auto* arg__ = pattern->NewNode(name_scope, #arg__) \
+                    ->assert_is_op_##io__(#op__, #arg__);
+
 PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
                       PDNode* x) {
  x->assert_is_op_input("lstm", "Input");
  auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm");
-#define NEW_NODE(arg__, io__)                        \
-  auto* arg__ = pattern->NewNode(name_scope, #arg__) \
-                    ->assert_is_op_##io__("lstm", #arg__);

  // Currently, the H0 and C0 are optional
  // TODO(Superjomn) upgrade the fuse framework to support optional.
  // NEW_NODE(H0, input);
  // NEW_NODE(C0, input);
-  NEW_NODE(Weight, input);
-  NEW_NODE(Bias, input);
+  NEW_NODE(lstm, Weight, input);
+  NEW_NODE(lstm, Bias, input);

-  NEW_NODE(Hidden, output);
-  NEW_NODE(Cell, output);
-  NEW_NODE(BatchGate, output);
-  NEW_NODE(BatchCellPreAct, output);
+  NEW_NODE(lstm, Hidden, output);
+  NEW_NODE(lstm, Cell, output);
+  NEW_NODE(lstm, BatchGate, output);
+  NEW_NODE(lstm, BatchCellPreAct, output);

  lstm_op->LinksFrom({x, Weight, Bias});
  lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
  return Hidden;
 }
+
+PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope,
+                      PDNode* x) {
+  x->assert_is_op_input("gru", "Input");
+  auto* gru_op = pattern->NewNode(name_scope, "gru")->assert_is_op("gru");
+
+  NEW_NODE(gru, Weight, input);
+  // TODO(Superjomn): upgrade the fuse framework to support optional inputs.
+  // H0 and Bias are optional, but this pattern currently requires Bias.
+  NEW_NODE(gru, Bias, input);  // also optional
+  // NEW_NODE(gru, H0, input);
+
+  NEW_NODE(gru, Hidden, output);
+  // The outputs below are marked as intermediate nodes of the pattern.
+  NEW_NODE(gru, BatchGate, output);
+  NEW_NODE(gru, BatchResetHiddenPrev, output);
+  NEW_NODE(gru, BatchHidden, output);
+
+  BatchGate->AsIntermediate();
+  BatchResetHiddenPrev->AsIntermediate();
+  BatchHidden->AsIntermediate();
+
+  gru_op->LinksFrom({x, Weight, Bias});
+  gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden});
+  return Hidden;  // the pattern node of the gru op's Hidden output
+}
+#undef NEW_NODE
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -19,6 +19,9 @@
 #endif

 #include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/inference/analysis/dot.h"
@@ -295,6 +298,8 @@ PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x,

 PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);

+PDNode* GRU(PDPattern* pattern, const std::string& name_scope, PDNode* x);
+
 }  // namespace patterns

 #define IR_NODE_LINK_TO(a, b) \

--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -50,20 +50,37 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(

  Dot dot;

-  std::vector<Dot::Attr> op_attrs({Dot::Attr("style", "filled"),
-                                   Dot::Attr("shape", "box"),
-                                   Dot::Attr("fillcolor", "red")});
-  std::vector<Dot::Attr> var_attrs({Dot::Attr("style", "filled,rounded"),
-                                    // Dot::Attr("shape", "diamond"),
-                                    Dot::Attr("fillcolor", "yellow")});
-
-  std::vector<Dot::Attr> marked_op_attrs({Dot::Attr("style", "filled"),
-                                          Dot::Attr("shape", "box"),
-                                          Dot::Attr("fillcolor", "lightgray")});
-  std::vector<Dot::Attr> marked_var_attrs(
-      {Dot::Attr("style", "filled,rounded"),
-       // Dot::Attr("shape", "diamond"),
-       Dot::Attr("fillcolor", "lightgray")});
+  const std::vector<Dot::Attr> op_attrs({
+      Dot::Attr("style", "rounded,filled,bold"),  //
+      Dot::Attr("shape", "box"),                  //
+      Dot::Attr("color", "#303A3A"),              //
+      Dot::Attr("fontcolor", "#ffffff"),          //
+      Dot::Attr("width", "1.3"),                  //
+      Dot::Attr("height", "0.84"),                //
+      Dot::Attr("fontname", "Arial"),             //
+  });
+  const std::vector<Dot::Attr> arg_attrs({
+      Dot::Attr("shape", "box"),                  //
+      Dot::Attr("style", "rounded,filled,bold"),  //
+      Dot::Attr("fontname", "Arial"),             //
+      Dot::Attr("fillcolor", "#999999"),          //
+      Dot::Attr("color", "#dddddd"),              //
+  });
+
+  const std::vector<Dot::Attr> param_attrs({
+      Dot::Attr("shape", "box"),                  //
+      Dot::Attr("style", "rounded,filled,bold"),  //
+      Dot::Attr("fontname", "Arial"),             //
+      Dot::Attr("color", "#148b97"),              //
+      Dot::Attr("fontcolor", "#ffffff"),          //
+  });
+
+  const std::vector<Dot::Attr> marked_op_attrs(
+      {Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"),
+       Dot::Attr("fillcolor", "yellow")});
+  const std::vector<Dot::Attr> marked_var_attrs(
+      {Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"),
+       Dot::Attr("fillcolor", "yellow")});

  auto marked_nodes = ConsumeMarkedNodes(graph.get());
  // Create nodes
@@ -74,9 +91,17 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
          marked_nodes.count(n) ? marked_op_attrs : op_attrs;
      dot.AddNode(node_id, attr, node_id);
    } else if (n->IsVar()) {
-      decltype(op_attrs) attr =
-          marked_nodes.count(n) ? marked_var_attrs : var_attrs;
-      dot.AddNode(node_id, attr, node_id);
+      decltype(op_attrs)* attr;
+      if (marked_nodes.count(n)) {
+        attr = &marked_var_attrs;
+      } else if (const_cast<Node*>(n)->Var() &&
+                 const_cast<Node*>(n)->Var()->Persistable()) {
+        attr = &param_attrs;
+      } else {
+        attr = &arg_attrs;
+      }
+
+      dot.AddNode(node_id, *attr, node_id);
    }
    node2dot[n] = node_id;
  }

--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -6,6 +6,7 @@ cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits
  analyzer.cc
  helper.cc
  # passes
+  analysis_pass.cc
  fluid_to_data_flow_graph_pass.cc
  data_flow_graph_to_fluid_pass.cc
  dfg_graphviz_draw_pass.cc
@@ -58,7 +59,7 @@ endif()
 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
    ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
-        --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
+         --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)

 inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
 inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
@@ -74,25 +75,42 @@ inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
 set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
 set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
 set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
-if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING)
+if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
  inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
  inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
 endif()

 inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
    ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
        --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)

 set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
 set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
 set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
-if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING)
+if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
    inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
    inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
 endif()

 inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
    ARGS --infer_model=${LAC_INSTALL_DIR}/model
        --infer_data=${LAC_INSTALL_DIR}/data.txt)
+
+
+set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
+set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
+set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE)
+
+if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
+  inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
+  inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz")
+endif()
+
+inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
+    ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
+         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
+         --topn=1 # Just run top 1 batch.
+    )
--- a/paddle/fluid/inference/analysis/pass.cc
+++ b/paddle/fluid/inference/analysis/pass.cc
@@ -12,4 +12,4 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -28,10 +28,10 @@ namespace paddle {
 namespace inference {
 namespace analysis {

-class Pass {
+class AnalysisPass {
 public:
-  Pass() = default;
-  virtual ~Pass() = default;
+  AnalysisPass() = default;
+  virtual ~AnalysisPass() = default;
  // Mutable Pass.
  virtual bool Initialize(Argument *argument) { return false; }
  // Readonly Pass.
@@ -42,23 +42,16 @@ class Pass {
  virtual bool Finalize() { return false; }

  // Get a Pass appropriate to print the Node this pass operates on.
-  virtual Pass *CreatePrinterPass(std::ostream &os,
-                                  const std::string &banner) const {
+  virtual AnalysisPass *CreatePrinterPass(std::ostream &os,
+                                          const std::string &banner) const {
    return nullptr;
  }

  // Create a debugger Pass that draw the DFG by graphviz toolkit.
-  virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
+  virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }

-  virtual void Run() { LOG(FATAL) << "not valid"; }
-  // Run on a single Node.
-  virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
-  // Run on a single Function.
-  virtual void Run(Function *x) { LOG(FATAL) << "not valid"; }
-  // Run on a single FunctionBlock.
-  virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
  // Run on a single DataFlowGraph.
-  virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; }
+  virtual void Run(DataFlowGraph *x) = 0;

  // Human-readable short representation.
  virtual std::string repr() const = 0;
@@ -66,29 +59,8 @@ class Pass {
  virtual std::string description() const { return "No DOC"; }
 };

-// NodePass process on any Node types.
-class NodePass : public Pass {
- public:
-  virtual void Run(Node *node) = 0;
-};
-
-// NodePass process on any Function node types.
-class FunctionPass : public Pass {
- public:
-  virtual void Run(Function *node) = 0;
-};
-
-// NodePass process on any FunctionBlock node types.
-class FunctionBlockPass : public Pass {
- public:
-  virtual void Run(FunctionBlock *node) = 0;
-};
-
 // GraphPass processes on any GraphType.
-class DataFlowGraphPass : public Pass {
- public:
-  virtual void Run(DataFlowGraph *graph) = 0;
-};
+class DataFlowGraphPass : public AnalysisPass {};

 }  // namespace analysis
 }  // namespace inference

--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -14,6 +14,8 @@

 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include <string>
+#include <vector>
+
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
@@ -41,27 +43,23 @@ class DfgPassManagerImpl final : public DfgPassManager {
 public:
  DfgPassManagerImpl() {
    // TODO(Superjomn) set the key with pass reprs.
-    LOG(INFO)
-        << "-----------------------------------------------------------------";
-    if (FLAGS_IA_enable_ir) {
-      AddPass("fluid-to-ir-pass", new FluidToIrPass);
-    } else {
+    if (!FLAGS_IA_enable_ir) {
      AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
+    } else {
+      AddPass("fluid-to-ir-pass", new FluidToIrPass);
    }
    TryAddTensorRtPass();
    AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
    if (!FLAGS_IA_output_storage_path.empty()) {
      AddPass("model-store-pass", new ModelStorePass);
    }
-    LOG(INFO)
-        << "-----------------------------------------------------------------";
  }

  std::string repr() const override { return "dfg-pass-manager"; }
  std::string description() const override { return "DFG pass manager."; }

 private:
-  void AddPass(const std::string& name, Pass* pass) {
+  void AddPass(const std::string& name, AnalysisPass* pass) {
    VLOG(3) << "Adding pass " << name;
    Register(name, pass);
    AddGraphvizDebugerPass(pass);
@@ -90,7 +88,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
  }

  // Add the graphviz debuger pass if the parent pass has one.
-  void AddGraphvizDebugerPass(Pass* pass) {
+  void AddGraphvizDebugerPass(AnalysisPass* pass) {
    auto* debuger_pass = pass->CreateGraphvizDebugerPass();
    if (debuger_pass) {
      Register(debuger_pass->repr(), debuger_pass);
@@ -101,19 +99,15 @@ class DfgPassManagerImpl final : public DfgPassManager {
 Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }

 void Analyzer::Run(Argument* argument) {
-  // Ugly support fluid-to-ir-pass
-  argument->Set(kFluidToIrPassesAttr,
-                new std::vector<std::string>({
-                    // Manual update the passes here.
-                    "graph_viz_pass",                              //
-                    "infer_clean_graph_pass", "graph_viz_pass",    //
-                    "attention_lstm_fuse_pass", "graph_viz_pass",  //
-                    "fc_lstm_fuse_pass", "graph_viz_pass",         //
-                    "mul_lstm_fuse_pass", "graph_viz_pass",        //
-                    "seq_concat_fc_fuse_pass", "graph_viz_pass",   //
-                    "fc_fuse_pass", "graph_viz_pass"               //
-
-                }));
+  std::vector<std::string> passes;
+  for (auto& pass : all_ir_passes_) {
+    if (!disabled_ir_passes_.count(pass)) {
+      passes.push_back(pass);
+      passes.push_back("graph_viz_pass");  // add graphviz for debug.
+    }
+  }
+  passes.push_back("graph_viz_pass");
+  argument->Set(kFluidToIrPassesAttr, new std::vector<std::string>(passes));

  for (auto& x : data_) {
    PADDLE_ENFORCE(x->Initialize(argument));
@@ -122,6 +116,11 @@ void Analyzer::Run(Argument* argument) {
  }
 }

+Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
+  disabled_ir_passes_.insert(passes.begin(), passes.end());
+  return *this;
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -36,16 +36,12 @@ limitations under the License. */
 */

 #include <gflags/gflags.h>
-#include "paddle/fluid/inference/analysis/pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/inference/analysis/flags.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"

-// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
-// flag if not available.
-DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
-DECLARE_string(IA_graphviz_log_root);
-DECLARE_string(IA_output_storage_path);
-DECLARE_bool(IA_enable_ir);
-
 namespace paddle {
 namespace inference {
 namespace analysis {
@@ -57,7 +53,28 @@ class Analyzer : public OrderedRegistry<PassManager> {

  void Run(Argument* argument);

+  Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
+
  DISABLE_COPY_AND_ASSIGN(Analyzer);
+
+ private:
+  // All available IR passes.
+  // The bigger fuse comes first, so that the small operators prefer to be
+  // merged in a larger fuse op. The small fusion will not break the pattern of
+  // larger fusion.
+  const std::vector<std::string> all_ir_passes_{{
+      // Manually update the passes here.
+      "infer_clean_graph_pass",    //
+      "attention_lstm_fuse_pass",  //
+      "fc_lstm_fuse_pass",         //
+      "mul_lstm_fuse_pass",        //
+      "fc_gru_fuse_pass",          //
+      "mul_gru_fuse_pass",         //
+      "seq_concat_fc_fuse_pass",   //
+      "fc_fuse_pass",              //
+  }};
+
+  std::unordered_set<std::string> disabled_ir_passes_;
 };

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -11,13 +11,14 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #include "paddle/fluid/inference/analysis/analyzer.h"
-#include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/platform/profiler.h"

 DEFINE_string(infer_model, "", "model path for LAC");
@@ -102,6 +103,7 @@ struct DataRecord {
    return data;
  }
 };
+
 void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
                 int batch_size) {
  auto one_batch = data->NextBatch();
@@ -114,12 +116,7 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
  input_slots->assign({input_tensor});
 }
-static void PrintTime(const double latency, const int bs, const int repeat) {
-  LOG(INFO) << "===========profile result===========";
-  LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
-            << ", avg latency: " << latency / repeat << "ms";
-  LOG(INFO) << "=====================================";
-}
+
 void BenchAllData(const std::string &model_path, const std::string &data_file,
                  const int batch_size, const int repeat) {
  NativeConfig config;
@@ -145,19 +142,18 @@ void BenchAllData(const std::string &model_path, const std::string &data_file,
      sum += timer.toc();
    }
  }
-  PrintTime(sum, batch_size, repeat);
+  PrintTime(batch_size, repeat, 1, 0, sum / repeat);
 }
+
 const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
                                25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
                                44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
                                14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
+
 void TestLACPrediction(const std::string &model_path,
                       const std::string &data_file, const int batch_size,
-                       const int repeat, bool test_all_data) {
-  if (test_all_data) {
-    BenchAllData(model_path, data_file, batch_size, repeat);
-    return;
-  }
+                       const int repeat, bool test_all_data,
+                       bool use_analysis = false) {
  NativeConfig config;
  config.model_dir = model_path;
  config.use_gpu = false;
@@ -166,17 +162,47 @@ void TestLACPrediction(const std::string &model_path,
  std::vector<PaddleTensor> input_slots, outputs_slots;
  DataRecord data(data_file, batch_size);
  GetOneBatch(&input_slots, &data, batch_size);
-  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  std::unique_ptr<PaddlePredictor> predictor;
+  if (use_analysis) {
+    AnalysisConfig cfg;
+    cfg.model_dir = model_path;
+    cfg.use_gpu = false;
+    cfg.device = 0;
+    cfg.specify_input_name = true;
+    cfg.enable_ir_optim = true;
+    predictor =
+        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
+  } else {
+    predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  }
  for (int i = 0; i < FLAGS_burning; i++) {
    predictor->Run(input_slots, &outputs_slots);
  }
  Timer timer;
+  if (test_all_data) {
+    double sum = 0;
+    LOG(INFO) << "Total number of samples: " << data.datasets.size();
+    for (int i = 0; i < repeat; i++) {
+      for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
+        GetOneBatch(&input_slots, &data, batch_size);
+        timer.tic();
+        predictor->Run(input_slots, &outputs_slots);
+        sum += timer.toc();
+      }
+    }
+    PrintTime(batch_size, repeat, 1, 0, sum / repeat);
+    LOG(INFO) << "Average latency of each sample: "
+              << sum / repeat / data.datasets.size() << " ms";
+    return;
+  }
  timer.tic();
  for (int i = 0; i < repeat; i++) {
    predictor->Run(input_slots, &outputs_slots);
  }
-  PrintTime(timer.toc(), batch_size, repeat);
+  PrintTime(batch_size, repeat, 1, 0, timer.toc() / repeat);
+
+  // check result
  EXPECT_EQ(outputs_slots.size(), 1UL);
  auto &out = outputs_slots[0];
  size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
@@ -188,12 +214,60 @@ void TestLACPrediction(const std::string &model_path,
  for (size_t i = 0; i < batch1_size; ++i) {
    EXPECT_EQ(pdata[i], lac_ref_data[i]);
  }
+
+  if (use_analysis) {
+    // run once for comparison as reference
+    auto ref_predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    std::vector<PaddleTensor> ref_outputs_slots;
+    ref_predictor->Run(input_slots, &ref_outputs_slots);
+    EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
+    auto &ref_out = ref_outputs_slots[0];
+    size_t ref_size =
+        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
+                        [](int a, int b) { return a * b; });
+    EXPECT_EQ(size, ref_size);
+    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
+    for (size_t i = 0; i < size; ++i) {
+      EXPECT_EQ(pdata_ref[i], pdata[i]);
+    }
+
+    AnalysisPredictor *analysis_predictor =
+        dynamic_cast<AnalysisPredictor *>(predictor.get());
+    auto &fuse_statis = analysis_predictor->analysis_argument()
+                            .Get<std::unordered_map<std::string, int>>(
+                                framework::ir::kFuseStatisAttr);
+    for (auto &item : fuse_statis) {
+      LOG(INFO) << "fused " << item.first << " " << item.second;
+    }
+    int num_ops = 0;
+    for (auto &node :
+         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+      if (node->IsFunction()) {
+        ++num_ops;
+      }
+    }
+    LOG(INFO) << "has num ops: " << num_ops;
+    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+    ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+    EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
+    EXPECT_EQ(num_ops, 11);
+  }
 }
+
 TEST(Analyzer_LAC, native) {
  LOG(INFO) << "LAC with native";
  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
                    FLAGS_repeat, FLAGS_test_all_data);
 }
+
+TEST(Analyzer_LAC, analysis) {
+  LOG(INFO) << "LAC with analysis";
+  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
+                    FLAGS_repeat, FLAGS_test_all_data, true);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
@@ -13,18 +13,19 @@
 // limitations under the License.

 #include "paddle/fluid/inference/analysis/analyzer.h"
-#include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/platform/profiler.h"

 DEFINE_string(infer_model, "", "model path");
 DEFINE_string(infer_data, "", "data path");
 DEFINE_int32(batch_size, 10, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");

 namespace paddle {
 namespace inference {
@@ -35,6 +36,7 @@ struct DataRecord {
  std::vector<size_t> lod;  // two inputs have the same lod info.
  size_t batch_iter{0};
  size_t batch_size{1};
+  size_t num_samples;  // total number of samples
  DataRecord() = default;
  explicit DataRecord(const std::string &path, int batch_size = 1)
      : batch_size(batch_size) {
@@ -81,6 +83,7 @@ struct DataRecord {
      word_data_all.push_back(std::move(word_data));
      mention_data_all.push_back(std::move(mention_data));
    }
+    num_samples = num_lines;
  }
 };

@@ -109,7 +112,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
 const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                       48, 39, 38, 16, 25};

-void TestChineseNERPrediction() {
+void TestChineseNERPrediction(bool use_analysis) {
  NativeConfig config;
  config.prog_file = FLAGS_infer_model + "/__model__";
  config.param_file = FLAGS_infer_model + "/param";
@@ -117,24 +120,53 @@ void TestChineseNERPrediction() {
  config.device = 0;
  config.specify_input_name = true;

-  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  std::vector<PaddleTensor> input_slots;
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  std::vector<PaddleTensor> input_slots, outputs;
+  std::unique_ptr<PaddlePredictor> predictor;
+  Timer timer;
+  if (use_analysis) {
+    AnalysisConfig cfg;
+    cfg.prog_file = FLAGS_infer_model + "/__model__";
+    cfg.param_file = FLAGS_infer_model + "/param";
+    cfg.use_gpu = false;
+    cfg.device = 0;
+    cfg.specify_input_name = true;
+    cfg.enable_ir_optim = true;
+    predictor =
+        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
+  } else {
+    predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  }
+
+  if (FLAGS_test_all_data) {
+    LOG(INFO) << "test all data";
+    double sum = 0;
+    size_t num_samples;
+    for (int i = 0; i < FLAGS_repeat; i++) {
+      DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+      num_samples = data.num_samples;
+      for (size_t bid = 0; bid < num_samples; ++bid) {
+        PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+        timer.tic();
+        predictor->Run(input_slots, &outputs);
+        sum += timer.toc();
+      }
+    }
+    LOG(INFO) << "total number of samples: " << num_samples;
+    PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
+    LOG(INFO) << "average latency of each sample: "
+              << sum / FLAGS_repeat / num_samples;
+    return;
+  }
  // Prepare inputs.
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-  std::vector<PaddleTensor> outputs;

-  Timer timer;
  timer.tic();
  for (int i = 0; i < FLAGS_repeat; i++) {
    predictor->Run(input_slots, &outputs);
  }
-  LOG(INFO) << "===========profile result===========";
-  LOG(INFO) << "batch_size: " << FLAGS_batch_size
-            << ", repeat: " << FLAGS_repeat
-            << ", latency: " << timer.toc() / FLAGS_repeat << "ms";
-  LOG(INFO) << "=====================================";
+  PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, timer.toc() / FLAGS_repeat);

  PADDLE_ENFORCE(outputs.size(), 1UL);
  auto &out = outputs[0];
@@ -145,10 +177,51 @@ void TestChineseNERPrediction() {
  for (size_t i = 0; i < std::min(11UL, size); i++) {
    PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]);
  }
+
+  if (use_analysis) {
+    // run once for comparison as reference
+    auto ref_predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    std::vector<PaddleTensor> ref_outputs_slots;
+    ref_predictor->Run(input_slots, &ref_outputs_slots);
+    EXPECT_EQ(ref_outputs_slots.size(), outputs.size());
+    auto &ref_out = ref_outputs_slots[0];
+    size_t ref_size =
+        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
+                        [](int a, int b) { return a * b; });
+    EXPECT_EQ(size, ref_size);
+    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
+    for (size_t i = 0; i < size; ++i) {
+      EXPECT_EQ(pdata_ref[i], result[i]);
+    }
+
+    AnalysisPredictor *analysis_predictor =
+        dynamic_cast<AnalysisPredictor *>(predictor.get());
+    auto &fuse_statis = analysis_predictor->analysis_argument()
+                            .Get<std::unordered_map<std::string, int>>(
+                                framework::ir::kFuseStatisAttr);
+    for (auto &item : fuse_statis) {
+      LOG(INFO) << "fused " << item.first << " " << item.second;
+    }
+    int num_ops = 0;
+    for (auto &node :
+         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+      if (node->IsFunction()) {
+        ++num_ops;
+      }
+    }
+    LOG(INFO) << "has num ops: " << num_ops;
+    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+    ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+    EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
+    EXPECT_EQ(num_ops, 14);
+  }
 }

-// Directly infer with the original model.
-TEST(Analyzer, Chinese_ner) { TestChineseNERPrediction(); }
+TEST(Analyzer_Chinese_ner, native) { TestChineseNERPrediction(false); }
+
+TEST(Analyzer_Chinese_ner, analysis) { TestChineseNERPrediction(true); }

 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -16,6 +16,7 @@

 #include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
+#include <thread>  // NOLINT
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
@@ -24,12 +25,12 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/platform/profiler.h"

 DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
 DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
 DEFINE_int32(batch_size, 10, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");

 namespace paddle {
 namespace inference {
@@ -220,39 +221,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
 }

-std::string DescribeTensor(const PaddleTensor &tensor) {
-  std::stringstream os;
-  os << "Tensor [" << tensor.name << "]\n";
-  os << " - type: ";
-  switch (tensor.dtype) {
-    case PaddleDType::FLOAT32:
-      os << "float32";
-      break;
-    case PaddleDType::INT64:
-      os << "int64";
-      break;
-    default:
-      os << "unset";
-  }
-  os << '\n';
-
-  os << " - shape: " << to_string(tensor.shape) << '\n';
-  os << " - lod: ";
-  for (auto &l : tensor.lod) {
-    os << to_string(l) << "; ";
-  }
-  os << "\n";
-  os << " - data: ";
-
-  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
-                            [](int a, int b) { return a * b; });
-  for (int i = 0; i < dim; i++) {
-    os << static_cast<float *>(tensor.data.data())[i] << " ";
-  }
-  os << '\n';
-  return os.str();
-}
-
 }  // namespace

 const float ditu_rnn_target_data[] = {
@@ -266,55 +234,93 @@ const float ditu_rnn_target_data[] = {
    10.7286, 12.0595, 10.6672, 0,       0,       0,       0,       0,
    93.5771, 3.84641, 0,       0,       0,       0,       0,       0,
    169.426, 0,       0,       0,       0,       0,       0,       0};
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<PaddleTensor> &base_outputs) {
+  PADDLE_ENFORCE_GT(outputs.size(), 0);
+  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &base_out = base_outputs[i];
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
+                                   1, [](int a, int b) { return a * b; });
+    PADDLE_ENFORCE_EQ(size, size1);
+    PADDLE_ENFORCE_GT(size, 0);
+    float *data = static_cast<float *>(out.data.data());
+    float *base_data = static_cast<float *>(base_out.data.data());
+    for (size_t i = 0; i < size; i++) {
+      EXPECT_NEAR(data[i], base_data[i], 1e-3);
+    }
+  }
+}
 // Test with a really complicate model.
-void TestDituRNNPrediction(const std::string &model_path,
-                           const std::string &data_path, int batch_size,
-                           bool use_analysis, bool activate_ir,
-                           int num_times = 1) {
-  NativeConfig config;
+void TestDituRNNPrediction(bool use_analysis, bool activate_ir,
+                           int num_threads) {
+  AnalysisConfig config;
  config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
  config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
  config.use_gpu = false;
  config.device = 0;
  config.specify_input_name = true;
+  config.enable_ir_optim = activate_ir;
+  PADDLE_ENFORCE(config.ir_mode ==
+                 AnalysisConfig::IrPassMode::kExclude);  // default
+  config.ir_passes.clear();  // Do not exclude any pass.
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;

  auto base_predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config);
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
  std::vector<PaddleTensor> input_slots;
-  DataRecord data(data_path, batch_size);
+  DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
  // Prepare inputs.
  PrepareInputs(&input_slots, &data, batch_size);
  std::vector<PaddleTensor> outputs, base_outputs;

  base_predictor->Run(input_slots, &base_outputs);

-  Timer timer;
-  timer.tic();
-  for (int i = 0; i < num_times; i++) {
-    predictor->Run(input_slots, &outputs);
-  }
-  LOG(INFO) << "===========profile result===========";
-  LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times
-            << ", latency: " << timer.toc() / num_times << "ms";
-  LOG(INFO) << "=====================================";
-
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t j = 0; j < size; j++) {
-      EXPECT_NEAR(data[j], base_data[j], 1e-3);
+  if (num_threads == 1) {
+    // Time the repeated runs, then compare against the base outputs.
+    Timer timer;
+    timer.tic();
+    for (int i = 0; i < num_times; i++) {
+      predictor->Run(input_slots, &outputs);
+    }
+    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
+    CompareResult(outputs, base_outputs);
+  } else {
+    std::vector<std::thread> threads;
+    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+    // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelized
+    // because AttentionLSTM's hard-coded node id will be damaged.
+    for (int tid = 0; tid < num_threads; ++tid) {
+      predictors.emplace_back(
+          CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+              config));
+    }
+    for (int tid = 0; tid < num_threads; ++tid) {
+      threads.emplace_back([&, tid]() {
+        // Each thread should have local input_slots and outputs.
+        std::vector<PaddleTensor> input_slots;
+        DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
+        PrepareInputs(&input_slots, &data, batch_size);
+        std::vector<PaddleTensor> outputs;
+        Timer timer;
+        timer.tic();
+        for (int i = 0; i < num_times; i++) {
+          predictors[tid]->Run(input_slots, &outputs);
+        }
+        PrintTime(batch_size, num_times, num_threads, tid,
+                  timer.toc() / num_times);
+        CompareResult(outputs, base_outputs);
+      });
+    }
+    for (int i = 0; i < num_threads; ++i) {
+      threads[i].join();
    }
  }

@@ -345,25 +351,26 @@ void TestDituRNNPrediction(const std::string &model_path,
  }
 }

-// Directly infer with the original model.
-TEST(Analyzer, DituRNN_without_analysis) {
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, false, false, FLAGS_repeat);
+// Inference with analysis and IR, easy for profiling independently.
+// The thread count comes from the --num_threads flag.
+// NOTE(review): the arguments are presumably (use_analysis, activate_ir,
+// num_threads) -- confirm against TestDituRNNPrediction's signature.
+TEST(Analyzer, DituRNN) {
+  TestDituRNNPrediction(true, true, FLAGS_num_threads);
 }

-// Inference with the original model with the analysis turned on, the analysis
-// module will transform the program to a data flow graph.
-TEST(Analyzer, DituRNN_with_analysis) {
-  LOG(INFO) << "ditu rnn with analysis";
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, true, false, FLAGS_repeat);
-}
-
-// Inference with analysis and IR. The IR module will fuse some large kernels.
-TEST(Analyzer, DituRNN_with_analysis_with_IR) {
-  LOG(INFO) << "ditu rnn with analysis and IR fuse";
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, true, true, FLAGS_repeat);
+// Other unit-tests of DituRNN, test different options of use_analysis,
+// activate_ir and multi-threads.
+TEST(Analyzer, DituRNN_tests) {
+  int num_threads[2] = {1, 4};
+  // Every configuration below is run once per entry of num_threads.
+  for (auto i : num_threads) {
+    // Directly infer with the original model.
+    TestDituRNNPrediction(false, false, i);
+    // Inference with the original model with the analysis turned on, the
+    // analysis module will transform the program to a data flow graph.
+    TestDituRNNPrediction(true, false, i);
+    // Inference with analysis and IR. The IR module will fuse some large
+    // kernels.
+    TestDituRNNPrediction(true, true, i);
+  }
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include <gflags/gflags.h>
+#include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
+#include <gtest/gtest.h>
+#include <fstream>
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/api/timer.h"
+
+DEFINE_string(infer_model, "", "Directory of the inference model.");
+DEFINE_string(infer_data, "", "Path of the dataset.");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(repeat, 1, "How many times to repeat run.");
+DEFINE_int32(topn, -1, "Run top n batches of data to save time");
+
+namespace paddle {
+namespace inference {
+
+// Line-oriented dataset reader: each call to NextBatch() parses one line of
+// space-separated integers from the file into an int64 tensor buffer.
+struct DataReader {
+  explicit DataReader(const std::string &path)
+      : file(new std::ifstream(path)) {}
+
+  // Fills `tensor` from the next line of the file.
+  //
+  // On success the tensor holds the parsed values in `data`, a shape of
+  // {N, 1} and a single-level LoD of {0, N}, where N is the number of parsed
+  // values. Returns false when no further line can be read. Only
+  // batch_size == 1 is accepted; any other value trips the enforce below.
+  bool NextBatch(PaddleTensor *tensor, int batch_size) {
+    PADDLE_ENFORCE_EQ(batch_size, 1);
+    tensor->lod.clear();
+    tensor->lod.emplace_back(std::vector<size_t>({0}));
+
+    std::vector<int64_t> ids;
+    std::string text;
+    int remaining = batch_size;
+    while (remaining-- > 0) {
+      if (!std::getline(*file, text)) return false;
+      inference::split_to_int64(text, ' ', &ids);
+    }
+
+    const size_t num_ids = ids.size();
+    const size_t num_bytes = num_ids * sizeof(int64_t);
+    tensor->lod.front().push_back(num_ids);
+
+    // Copy the parsed ids into the tensor's own buffer.
+    tensor->data.Resize(num_bytes);
+    memcpy(tensor->data.data(), ids.data(), num_bytes);
+
+    tensor->shape.clear();
+    tensor->shape.push_back(num_ids);
+    tensor->shape.push_back(1);
+    return true;
+  }
+
+  // Input stream over the dataset file, opened in the constructor.
+  std::unique_ptr<std::ifstream> file;
+};
+
+// Runs the model in FLAGS_infer_model through the analysis predictor over the
+// dataset in FLAGS_infer_data, FLAGS_repeat times, then logs the average
+// latency per pass and a short summary of the last outputs.
+//
+// batch_size: number of lines fed per Run() call (DataReader only accepts 1).
+void Main(int batch_size) {
+  // Create the predictor: CPU only, with IR optimization enabled.
+  AnalysisConfig config;
+  config.model_dir = FLAGS_infer_model;
+  config.use_gpu = false;
+  config.enable_ir_optim = true;
+  auto predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+
+  // A single int64 input slot, refilled by the reader for every batch.
+  std::vector<PaddleTensor> input_slots(1);
+  auto &input = input_slots[0];
+  input.dtype = PaddleDType::INT64;
+
+  inference::Timer timer;
+  double sum = 0;  // accumulated Run() time over all passes
+  std::vector<PaddleTensor> output_slots;
+
+  for (int t = 0; t < FLAGS_repeat; t++) {
+    DataReader reader(FLAGS_infer_data);
+    // Count batches per pass so that --topn limits every repeat equally; the
+    // reported latency is averaged over FLAGS_repeat passes.
+    int num_batches = 0;
+    // Use the batch_size argument rather than the global flag so the reader
+    // and the PrintTime() report below always agree.
+    while (reader.NextBatch(&input, batch_size)) {
+      // ">=" so that exactly topn batches are run (">" ran topn + 1).
+      if (FLAGS_topn > 0 && num_batches >= FLAGS_topn) break;
+      timer.tic();
+      CHECK(predictor->Run(input_slots, &output_slots));
+      sum += timer.toc();
+      ++num_batches;
+    }
+  }
+  PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
+
+  // Get output
+  LOG(INFO) << "get outputs " << output_slots.size();
+
+  for (auto &output : output_slots) {
+    LOG(INFO) << "output.shape: " << to_string(output.shape);
+    // no lod ?
+    CHECK_EQ(output.lod.size(), 0UL);
+    LOG(INFO) << "output.dtype: " << output.dtype;
+    // Summarize at most the first five values without reading past the end
+    // of the buffer when the output holds fewer than five floats.
+    // NOTE(review): the buffer is read as float regardless of output.dtype.
+    const size_t num_floats = output.data.length() / sizeof(float);
+    const float *values = static_cast<float *>(output.data.data());
+    std::stringstream ss;
+    for (size_t i = 0; i < 5 && i < num_floats; i++) {
+      ss << values[i] << " ";
+    }
+    LOG(INFO) << "output.data summary: " << ss.str();
+  }
+}
+
+// Runs the text-classification inference once with the batch size taken from
+// the --batch_size flag.
+TEST(text_classification, basic) { Main(FLAGS_batch_size); }
+
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -263,7 +263,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }  // namespace

-Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
+AnalysisPass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
      FLAGS_IA_graphviz_log_root,
      "data_flow_graph_to_fluid_graphviz_debugger"));

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -21,8 +21,8 @@

 #include <string>
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
-#include "paddle/fluid/inference/analysis/pass.h"

 namespace paddle {
 namespace inference {
@@ -42,7 +42,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
    return "Transform a DFG to a Fluid ProgramDesc";
  }

-  Pass *CreateGraphvizDebugerPass() const override;
+  AnalysisPass *CreateGraphvizDebugerPass() const override;

 protected:
  // Add a Fluid Op into the ProgramDesc.

--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@@ -21,8 +21,8 @@ limitations under the License. */

 #include <fstream>
 #include <string>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/dot.h"
-#include "paddle/fluid/inference/analysis/pass.h"

 namespace paddle {
 namespace inference {

--- a/paddle/fluid/inference/analysis/flags.h
+++ b/paddle/fluid/inference/analysis/flags.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <gflags/gflags.h>
+
+// Declarations of the inference-analysis (IA_*) command-line flags, so that
+// any file including this header can read or set them.
+// NOTE(review): the matching DEFINE_* statements are not in this header --
+// confirm which source file owns them.
+
+// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
+// flag if not available.
+DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
+DECLARE_string(IA_graphviz_log_root);
+DECLARE_string(IA_output_storage_path);
+DECLARE_bool(IA_enable_ir);
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -66,7 +66,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }

-Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
+AnalysisPass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
      FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger"));
 }

--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -22,8 +22,8 @@
 #include <string>

 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
-#include "paddle/fluid/inference/analysis/pass.h"

 namespace paddle {
 namespace inference {
@@ -46,7 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
    return "transform a fluid ProgramDesc to a data flow graph.";
  }

-  Pass *CreateGraphvizDebugerPass() const override;
+  AnalysisPass *CreateGraphvizDebugerPass() const override;

 private:
  framework::proto::ProgramDesc const *desc_;

--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
@@ -14,14 +14,17 @@

 #pragma once

+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/inference/analysis/flags.h"
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
-#include "paddle/fluid/inference/analysis/pass.h"

 namespace paddle {
 namespace inference {
 namespace analysis {
-using namespace framework;

 static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__";

@@ -47,7 +50,8 @@ class FluidToIrPass final : public DataFlowGraphPass {
    ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
    // Load program.
    auto program = LoadProgramDesc(*argument->fluid_model_program_path);
-    argument->origin_program_desc.reset(new proto::ProgramDesc(program));
+    argument->origin_program_desc.reset(
+        new framework::proto::ProgramDesc(program));
    // Create main data flow graph.
    if (!argument->main_dfg) {
      argument->main_dfg.reset(new DataFlowGraph);
@@ -77,27 +81,30 @@ class FluidToIrPass final : public DataFlowGraphPass {
    IRPassManager ir_passes(argument_->Get<ProgramDesc>("ir_program_desc"),
                            nullptr);
    // Pass the scope from analysis to IR if needed.
-    if (argument_->Has(ir::kParamScopeAttr)) {
+    if (argument_->Has(framework::ir::kParamScopeAttr)) {
      // Here the address is passed, attention that IR doesn't own the scope, so
      // the real scope in analysis should live during the IR phase.
      ir_passes.graph().Set(
-          ir::kParamScopeAttr,
-          new Scope *(&argument_->Get<Scope>(ir::kParamScopeAttr)));
+          framework::ir::kParamScopeAttr,
+          new framework::Scope *(&argument_->Get<framework::Scope>(
+              framework::ir::kParamScopeAttr)));
    }

-    const auto &ir_passes_to_apply =
-        argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr);
-    ir_passes.Apply(ir_passes_to_apply);
+    if (FLAGS_IA_enable_ir) {
+      const auto &ir_passes_to_apply =
+          argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr);
+      ir_passes.Apply(ir_passes_to_apply);
+    }

    PADDLE_ENFORCE(argument_->main_dfg.get());
    argument_->main_dfg->Build(ir_passes.graph());
    // inherit the arguments from ir.
-    if (ir_passes.graph().Has(ir::kFuseStatisAttr)) {
+    if (ir_passes.graph().Has(framework::ir::kFuseStatisAttr)) {
      argument_->Set(
-          ir::kFuseStatisAttr,
+          framework::ir::kFuseStatisAttr,
          new std::unordered_map<std::string, int>(
              ir_passes.graph().Get<std::unordered_map<std::string, int>>(
-                  ir::kFuseStatisAttr)));
+                  framework::ir::kFuseStatisAttr)));
    }
  }

@@ -109,7 +116,7 @@ class FluidToIrPass final : public DataFlowGraphPass {

 private:
  // Load parameters from a single file or from a directory.
-  bool LoadParams(Scope *scope, const std::string &dir,
+  bool LoadParams(framework::Scope *scope, const std::string &dir,
                  const std::string &prog_file, const std::string &param_file);

 private:

--- a/paddle/fluid/inference/analysis/model_store_pass.h
+++ b/paddle/fluid/inference/analysis/model_store_pass.h
@@ -19,7 +19,7 @@

 #pragma once
 #include <string>
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"

 namespace paddle {
 namespace inference {

--- a/paddle/fluid/inference/analysis/pass_manager.cc
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
@@ -40,17 +40,6 @@ void DfgPassManager::RunAll() {
  }
 }

-void NodePassManager::RunAll() {
-  PADDLE_ENFORCE(argument_);
-  PADDLE_ENFORCE(argument_->main_dfg.get());
-  auto trait = GraphTraits<DataFlowGraph>(*argument_->main_dfg).nodes_in_DFS();
-  for (auto& node : trait) {
-    for (auto& pass : data_) {
-      pass->Run(&node);
-    }
-  }
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/pass_manager.h
+++ b/paddle/fluid/inference/analysis/pass_manager.h
@@ -33,7 +33,7 @@ limitations under the License. */

 #include <string>
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"

 namespace paddle {
 namespace inference {
@@ -43,7 +43,7 @@ namespace analysis {
 * PassManager is the base class for all pass managers, a pass manager has
 * several Pass-es registered, and execute them in the linear order.
 */
-class PassManager : public OrderedRegistry<Pass> {
+class PassManager : public OrderedRegistry<AnalysisPass> {
 public:
  PassManager() = default;
  // Call all the passes' Initialize methods. The desc and data_flow_graph are
@@ -89,18 +89,6 @@ class DfgPassManager : public PassManager {
  virtual ~DfgPassManager() = default;
 };

-/*
- * A pass manager that process a Node each time.
- */
-class NodePassManager : public PassManager {
- public:
-  NodePassManager() = default;
-
-  void RunAll() override;
-
-  virtual ~NodePassManager() = default;
-};
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
@@ -34,28 +34,6 @@ class TestDfgPassManager final : public DfgPassManager {
  std::string description() const override { return "test doc"; }
 };

-class TestNodePassManager final : public NodePassManager {
- public:
-  virtual ~TestNodePassManager() = default;
-
-  std::string repr() const override { return "test-node-pass-manager"; }
-  std::string description() const override { return "test doc"; }
-};
-
-class TestNodePass final : public NodePass {
- public:
-  virtual ~TestNodePass() = default;
-
-  bool Initialize(Argument* argument) override { return true; }
-
-  void Run(Node* node) override {
-    LOG(INFO) << "- Processing node " << node->repr();
-  }
-
-  std::string repr() const override { return "test-node"; }
-  std::string description() const override { return "some doc"; }
-};
-
 TEST(PassManager, DFG_pass_manager) {
  TestDfgPassManager manager;
  DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
@@ -71,19 +49,6 @@ TEST(PassManager, DFG_pass_manager) {
  manager.RunAll();
 }

-TEST(PassManager, Node_pass_manager) {
-  Argument argument(FLAGS_inference_model_dir);
-  // Pre-process: initialize the DFG with the ProgramDesc first.
-  FluidToDataFlowGraphPass pass0;
-  pass0.Initialize(&argument);
-  pass0.Run(argument.main_dfg.get());
-
-  TestNodePassManager manager;
-  manager.Register("test-node-pass", new TestNodePass);
-  ASSERT_TRUE(manager.Initialize(&argument));
-  manager.RunAll();
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
@@ -68,7 +68,7 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass {
  }
 };

-Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
+AnalysisPass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
  DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root,
                                      "tensorrt_marked_node");
  return new DfgDebuggerPass(config);

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
@@ -20,7 +20,7 @@
 #pragma once

 #include <string>
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/subgraph_splitter.h"

 namespace paddle {
@@ -48,7 +48,7 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
    return "tensorrt sub-graph mark pass";
  }

-  Pass* CreateGraphvizDebugerPass() const override;
+  AnalysisPass* CreateGraphvizDebugerPass() const override;
  bool Finalize() override;

 private:

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once

 #include <string>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/node.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 #include "paddle/fluid/inference/analysis/subgraph_splitter.h"

 namespace paddle {

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -44,8 +44,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)

 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api)
-
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis)
 cc_test(test_paddle_inference_api
        SRCS api_tester.cc
        DEPS paddle_inference_api)
@@ -61,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
 inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()

-if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
+if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
    # compile the libinference_anakin_api.a and anakin.so.
    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml)
    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
@@ -71,12 +70,24 @@ if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
    anakin_target(inference_anakin_api)
    anakin_target(inference_anakin_api_shared)
    if (WITH_TESTING)
-        cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc 
-                ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
-                DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+        # TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
+        set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
+        set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
+        set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
+        execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
+        execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
+        execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
+        if(WITH_GPU)
+            set(anakin_test_extra_deps dynload_cuda)
+            set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
+            execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
+            cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc 
+                    ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
+                    DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
+        endif()
        cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc 
                ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin 
                     --datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt
-                DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+                DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
    endif(WITH_TESTING)
 endif()
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -14,24 +14,40 @@

 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DECLARE_bool(profile);

 namespace paddle {

 bool AnalysisPredictor::Init(
    const std::shared_ptr<framework::Scope>& parent_scope) {
  VLOG(3) << "Predictor::init()";
+#if !defined(_WIN32)
+  if (FLAGS_profile) {
+    LOG(WARNING) << "Profiler is actived, might affect the performance";
+    LOG(INFO) << "You can turn off by set gflags '-profile false'";
+    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
+                                           : platform::ProfilerState::kCPU;
+    platform::EnableProfiler(tracking_device);
+  }
+#endif
+
  if (config_.use_gpu) {
    place_ = paddle::platform::CUDAPlace(config_.device);
+    LOG(WARNING) << "ir optimize only supports CPU currently";
+    config_.enable_ir_optim = false;
  } else {
    place_ = paddle::platform::CPUPlace();
  }
-  PADDLE_ENFORCE(!parent_scope);
  if (parent_scope) {
    scope_ = parent_scope;
    sub_scope_ = &(parent_scope->NewScope());
@@ -73,7 +89,7 @@ bool AnalysisPredictor::Init(

 void AnalysisPredictor::OptimizeInferenceProgram() {
  LOG(INFO) << "optimize begin";
-  FLAGS_IA_enable_ir = true;
+  FLAGS_IA_enable_ir = config_.enable_ir_optim;
  FLAGS_IA_enable_tensorrt_subgraph_engine = false;
  FLAGS_IA_output_storage_path = "";  // Don't output the model.
  // Analyze inference_program
@@ -90,24 +106,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  }
  argument_.origin_program_desc.reset(
      new ProgramDesc(*inference_program_->Proto()));
-  Analyzer().Run(&argument_);
+  PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude,
+                 "Only kExclude is supported yet.");
+  Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);
+
  CHECK(argument_.transformed_program_desc);
  VLOG(5) << "to prepare executor";
-  // LOG(INFO) << "transformed_parogram_desc " <<
-  // argument.transformed_program_desc->DebugString();
  inference_program_.reset(
      new framework::ProgramDesc(*argument_.transformed_program_desc));
-  PADDLE_ENFORCE(argument_.Has(framework::ir::kParamScopeAttr));
-  // Update scope.
-  scope_.reset(
-      argument_.Release<framework::Scope>(framework::ir::kParamScopeAttr));
-  LOG(INFO) << "optimize end ==";
+  if (argument_.Has(framework::ir::kParamScopeAttr)) {
+    // Update scope.
+    scope_.reset(
+        argument_.Release<framework::Scope>(framework::ir::kParamScopeAttr));
+  }
+  LOG(INFO) << "== optimize end ==";
 }

 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
-    NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) {
-  VLOG(3) << "create NativePredictor";
+    AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) {
+  VLOG(3) << "create AnalysisConfig";
  if (config.use_gpu) {
    // 1. GPU memeroy
    PADDLE_ENFORCE_GT(

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <string>
+#include <vector>
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
@@ -28,7 +30,7 @@ using framework::proto::ProgramDesc;
 */
 class AnalysisPredictor : public NativePaddlePredictor {
 public:
-  explicit AnalysisPredictor(const NativeConfig& config)
+  explicit AnalysisPredictor(const AnalysisConfig& config)
      : NativePaddlePredictor(config), config_(config) {}

  bool Init(const std::shared_ptr<framework::Scope>& parent_scope);
@@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor {
  Argument& analysis_argument() { return argument_; }

 private:
-  NativeConfig config_;
+  AnalysisConfig config_;
  Argument argument_;
 };


--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
@@ -193,7 +193,9 @@ PaddleInferenceAnakinPredictor<Target>::Clone() {
  return std::move(cls);
 }

+#ifdef PADDLE_WITH_CUDA
 template class PaddleInferenceAnakinPredictor<anakin::NV>;
+#endif
 template class PaddleInferenceAnakinPredictor<anakin::X86>;

 // A factory to help create difference predictor.
@@ -202,10 +204,15 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
    AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
  VLOG(3) << "Anakin Predictor create.";
  if (config.target_type == AnakinConfig::NVGPU) {
+#ifdef PADDLE_WITH_CUDA
    VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
    std::unique_ptr<PaddlePredictor> x(
        new PaddleInferenceAnakinPredictor<anakin::NV>(config));
    return x;
+#else
+    LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment";
+    return nullptr;
+#endif
  } else if (config.target_type == AnakinConfig::X86) {
    VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
    std::unique_ptr<PaddlePredictor> x(

--- a/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc
@@ -20,71 +20,16 @@ limitations under the License. */
 #include <iostream>
 #include <thread>  // NOLINT
 #include <vector>
-#include "framework/core/net/net.h"
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/timer.h"
+#include "utils/logger/logger.h"

 DEFINE_string(model, "", "Directory of the inference model.");
 DEFINE_string(datapath, "", "Path of the dataset.");
 DEFINE_int32(batch_size, 1, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");

-// Timer for timer
-class Timer {
- public:
-  double start;
-  double startu;
-  void tic() {
-    struct timeval tp;
-    gettimeofday(&tp, NULL);
-    start = tp.tv_sec;
-    startu = tp.tv_usec;
-  }
-  double toc() {
-    struct timeval tp;
-    gettimeofday(&tp, NULL);
-    double used_time_ms =
-        (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
-    return used_time_ms;
-  }
-};
-
-std::vector<std::string> string_split(std::string in_str,
-                                      std::string delimiter) {
-  std::vector<std::string> seq;
-  int found = in_str.find(delimiter);
-  int pre_found = -1;
-  while (found != std::string::npos) {
-    if (pre_found == -1) {
-      seq.push_back(in_str.substr(0, found));
-    } else {
-      seq.push_back(in_str.substr(pre_found + delimiter.length(),
-                                  found - delimiter.length() - pre_found));
-    }
-    pre_found = found;
-    found = in_str.find(delimiter, pre_found + delimiter.length());
-  }
-  seq.push_back(
-      in_str.substr(pre_found + 1, in_str.length() - (pre_found + 1)));
-  return seq;
-}
-std::vector<std::string> string_split(
-    std::string in_str, std::vector<std::string>& delimiter) {  // NOLINT
-  std::vector<std::string> in;
-  std::vector<std::string> out;
-  out.push_back(in_str);
-  for (auto del : delimiter) {
-    in = out;
-    out.clear();
-    for (auto s : in) {
-      auto out_s = string_split(s, del);
-      for (auto o : out_s) {
-        out.push_back(o);
-      }
-    }
-  }
-  return out;
-}
-
 class Data {
 public:
  Data(std::string file_name, int batch_size)
@@ -120,36 +65,24 @@ void Data::get_batch_data(
  week_fea.clear();
  time_fea.clear();
  while (_file.getline(buf, 10000)) {
-    std::string s = buf;
-    std::vector<std::string> deli_vec = {":"};
-    std::vector<std::string> data_vec = string_split(s, deli_vec);
+    std::vector<std::string> data_vec;
+    paddle::inference::split(buf, ':', &data_vec);

    std::vector<std::string> seq;
-    seq = string_split(data_vec[0], {"|"});
+    paddle::inference::split(data_vec[0], '|', &seq);

    for (auto link : seq) {
-      std::vector<std::string> data = string_split(link, ",");
      std::vector<float> vec;
-      for (int i = 0; i < data.size(); i++) {
-        vec.push_back(atof(data[i].c_str()));
-      }
+      paddle::inference::split_to_float(link, ',', &vec);
      fea.push_back(vec);
    }
-    std::vector<std::string> week_data;
-    std::vector<std::string> time_data;

-    week_data = string_split(data_vec[2], ",");
    std::vector<float> vec_w;
-    for (int i = 0; i < week_data.size(); i++) {
-      vec_w.push_back(atof(week_data[i].c_str()));
-    }
+    paddle::inference::split_to_float(data_vec[2], ',', &vec_w);
    week_fea.push_back(vec_w);

-    time_data = string_split(data_vec[1], ",");
    std::vector<float> vec_t;
-    for (int i = 0; i < time_data.size(); i++) {
-      vec_t.push_back(atof(time_data[i].c_str()));
-    }
+    paddle::inference::split_to_float(data_vec[1], ',', &vec_t);
    time_fea.push_back(vec_t);

    cum += seq.size();
@@ -275,14 +208,13 @@ void single_test() {
    inputs.push_back(tensor_2);
    inputs.push_back(tensor_0);

-    Timer timer;
+    paddle::inference::Timer timer;
    timer.tic();
    for (int i = 0; i < FLAGS_repeat; i++) predictor->Run(inputs, &outputs);

-    LOG(INFO) << "batch_size = " << FLAGS_batch_size
-              << ", repeat = " << FLAGS_repeat
-              << ", sequence_length = " << seq_offset[seq_offset.size() - 1]
-              << ", latency: " << timer.toc() / FLAGS_repeat << "ms";
+    paddle::inference::PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0,
+                                 timer.toc() / FLAGS_repeat);
+    LOG(INFO) << "sequence_length = " << seq_offset[seq_offset.size() - 1];

    float* data_o = static_cast<float*>(outputs[0].data.data());
    VLOG(3) << "outputs[0].data.length() = " << outputs[0].data.length();

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -176,7 +176,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                    framework::Scope *scope) {
  VLOG(3) << "Predictor::set_feed";
  if (inputs.size() != feeds_.size()) {
-    LOG(ERROR) << "wrong feed input size.";
+    LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
+               << inputs.size();
    return false;
  }
  for (size_t i = 0; i < inputs.size(); ++i) {

--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -14,6 +14,7 @@

 #pragma once

+#include <glog/logging.h>
 #include <sys/time.h>
 #include <algorithm>
 #include <numeric>
@@ -88,5 +89,63 @@ static void TensorAssignData(PaddleTensor *tensor,
  }
 }

+// Renders `tensor` as multi-line text: name, dtype, shape, LoD and every
+// element of the data buffer. Marked `inline` because this function is defined
+// in a header; without it every including translation unit would emit its own
+// external definition and the link would fail with duplicate symbols.
+inline std::string DescribeTensor(const PaddleTensor &tensor) {
+  std::stringstream os;
+  os << "Tensor [" << tensor.name << "]\n";
+  os << " - type: ";
+  switch (tensor.dtype) {
+    case PaddleDType::FLOAT32:
+      os << "float32";
+      break;
+    case PaddleDType::INT64:
+      os << "int64";
+      break;
+    default:
+      os << "unset";
+  }
+  os << '\n';
+
+  os << " - shape: " << to_string(tensor.shape) << '\n';
+  os << " - lod: ";
+  for (auto &l : tensor.lod) {
+    os << to_string(l) << "; ";
+  }
+  os << "\n";
+  os << " - data: ";
+
+  // Number of elements = product of all dimensions.
+  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
+                            [](int a, int b) { return a * b; });
+  if (tensor.dtype == PaddleDType::INT64) {
+    // Read int64 buffers as int64; viewing them as float prints garbage.
+    const auto *data = static_cast<const int64_t *>(tensor.data.data());
+    for (int i = 0; i < dim; i++) {
+      os << data[i] << " ";
+    }
+  } else {
+    // FLOAT32, and any other dtype, keeps the float view used so far.
+    const auto *data = static_cast<const float *>(tensor.data.data());
+    for (int i = 0; i < dim; i++) {
+      os << data[i] << " ";
+    }
+  }
+  os << '\n';
+  return os.str();
+}
+
+// Logs a one-line benchmark summary: batch size, repeat count, thread count,
+// thread id and latency (labelled "ms"). `inline` so that this header can be
+// included from several translation units without duplicate-symbol errors.
+inline void PrintTime(int batch_size, int repeat, int num_threads, int tid,
+                      double latency) {
+  LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
+            << ", threads: " << num_threads << ", thread id: " << tid
+            << ", latency: " << latency << "ms ======";
+}
+
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -150,6 +150,21 @@ struct TensorRTConfig : public NativeConfig {
  int workspace_size{1 << 30};
 };

+// NOTE: WIP, not stable yet. A NativeConfig extended with IR-pass settings.
+struct AnalysisConfig : public NativeConfig {
+  // Selects how the entries of `ir_passes` are interpreted.
+  enum class IrPassMode {
+    kSystem,   // Use the system default passes; no customization.
+    kInclude,  // `ir_passes` specifies the passes to use.
+    kExclude   // `ir_passes` specifies the passes to disable.
+  };
+
+  bool enable_ir_optim = true;
+  IrPassMode ir_mode{IrPassMode::kExclude};
+  // attention_lstm_fuse_pass only suits specific models: disabled by default.
+  std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
+};
+
 // A factory to help create different predictors.
 //
 // FOR EXTENSION DEVELOPER:

--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/auc_op.h"
-#include <string>

 namespace paddle {
 namespace operators {
@@ -36,15 +35,12 @@ class AucOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(predict_height, label_height,
                      "Out and Label should have same height.");

-    int num_thres = ctx->Attrs().Get<int>("num_thresholds");
+    int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;

    ctx->SetOutputDim("AUC", {1});
-    ctx->SetOutputDim("TPOut", {num_thres});
-    ctx->SetOutputDim("TNOut", {num_thres});
-    ctx->SetOutputDim("FPOut", {num_thres});
-    ctx->SetOutputDim("FNOut", {num_thres});
-
-    ctx->ShareLoD("Predict", /*->*/ "AUC");
+    ctx->SetOutputDim("BatchAUC", {1});
+    ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
+    ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
  }

 protected:
@@ -66,25 +62,24 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Label",
             "A 2D int tensor indicating the label of the training data. "
             "shape: [batch_size, 1]");
-    AddInput("TP", "True-Positive value.");
-    AddInput("FP", "False-Positive value.");
-    AddInput("TN", "True-Negative value.");
-    AddInput("FN", "False-Negative value.");
    // TODO(typhoonzero): support weight input
+    AddInput("StatPos", "Statistic value when label = 1");
+    AddInput("StatNeg", "Statistic value when label = 0");
+
    AddOutput("AUC",
              "A scalar representing the "
              "current area-under-the-curve.");
-    AddOutput("TPOut", "True-Positive value.");
-    AddOutput("FPOut", "False-Positive value.");
-    AddOutput("TNOut", "True-Negative value.");
-    AddOutput("FNOut", "False-Negative value.");
+    AddOutput("BatchAUC", "The AUC for current batch");
+    AddOutput("StatPosOut", "Statistic value when label = 1");
+    AddOutput("StatNegOut", "Statistic value when label = 0");

    AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
        .SetDefault("ROC");
+
    AddAttr<int>("num_thresholds",
                 "The number of thresholds to use when discretizing the"
                 " roc curve.")
-        .SetDefault(200);
+        .SetDefault((2 << 12) - 1);

    AddComment(R"DOC(
 Area Under The Curve (AUC) Operator.

--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"

 namespace paddle {
@@ -23,106 +23,106 @@ namespace operators {

 using Tensor = framework::Tensor;

-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
+// Computes the area under the ROC curve from bucketed prediction counts.
+// Every score is mapped to one of `num_thresholds + 1` buckets. "AUC" is
+// computed from the counts added into the StatPosOut/StatNegOut buffers,
+// while "BatchAUC" is computed from the current batch alone.
 template <typename DeviceContext, typename T>
 class AucKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* predict = ctx.Input<Tensor>("Predict");
-    auto* label = ctx.Input<Tensor>("Label");
-    auto* auc = ctx.Output<Tensor>("AUC");
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *predict = ctx.Input<Tensor>("Predict");
+    auto *label = ctx.Input<Tensor>("Label");
+
+    // NOTE(review): "curve" is read but never consulted below; calcAuc always
+    // integrates positives against negatives (ROC), so "PR" is not honored.
+    std::string curve = ctx.Attr<std::string>("curve");
+    int num_thresholds = ctx.Attr<int>("num_thresholds");
+    int num_pred_buckets = num_thresholds + 1;
+
     // Only use output var for now, make sure it's persistable and
     // not cleaned up for each batch.
-    auto* true_positive = ctx.Output<Tensor>("TPOut");
-    auto* false_positive = ctx.Output<Tensor>("FPOut");
-    auto* true_negative = ctx.Output<Tensor>("TNOut");
-    auto* false_negative = ctx.Output<Tensor>("FNOut");
+    auto *auc = ctx.Output<Tensor>("AUC");
+    auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
+    auto *stat_neg = ctx.Output<Tensor>("StatNegOut");

-    auto* auc_data = auc->mutable_data<double>(ctx.GetPlace());
+    auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
+    auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
+    calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
+            auc);

-    std::string curve = ctx.Attr<std::string>("curve");
-    int num_thresholds = ctx.Attr<int>("num_thresholds");
-    std::vector<double> thresholds_list;
-    thresholds_list.reserve(num_thresholds);
-    for (int i = 1; i < num_thresholds - 1; i++) {
-      thresholds_list[i] = static_cast<double>(i) / (num_thresholds - 1);
-    }
-    const double kEpsilon = 1e-7;
-    thresholds_list[0] = 0.0f - kEpsilon;
-    thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon;
+    // The batch-only AUC starts from zeroed, throw-away histograms.
+    auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
+    std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
+    std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
+    calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
+            num_thresholds, batch_auc);
+  }

+ private:
+  // Area of the trapezoid spanned by (X1, Y1) and (X2, Y2) above the X axis.
+  inline static double trapezoidArea(double X1, double X2, double Y1,
+                                     double Y2) {
+    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
+  }
+
+  // Adds the current batch to the `stat_pos`/`stat_neg` histograms (each must
+  // hold `num_thresholds + 1` counters) and writes the AUC computed from the
+  // resulting histograms into `auc_tensor`. Scores outside [0, 1] (or NaN) are
+  // rejected, because they would index outside the histograms.
+  inline static void calcAuc(const framework::ExecutionContext &ctx,
+                             const framework::Tensor *label,
+                             const framework::Tensor *predict,
+                             int64_t *stat_pos, int64_t *stat_neg,
+                             int num_thresholds,
+                             framework::Tensor *auc_tensor) {
     size_t batch_size = predict->dims()[0];
     size_t inference_width = predict->dims()[1];
+    const T *inference_data = predict->data<T>();
+    const auto *label_data = label->data<int64_t>();
+
+    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());

-    const T* inference_data = predict->data<T>();
-    const auto* label_data = label->data<int64_t>();
-
-    auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
-    auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
-    auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());
-    auto* fp_data = false_positive->mutable_data<int64_t>(ctx.GetPlace());
-
-    for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
-      // calculate TP, FN, TN, FP for current thresh
-      int64_t tp = 0, fn = 0, tn = 0, fp = 0;
-      for (size_t i = 0; i < batch_size; i++) {
-        // NOTE: label_data used as bool, labels > 0 will be treated as true.
-        if (label_data[i]) {
-          if (inference_data[i * inference_width + 1] >=
-              (thresholds_list[idx_thresh])) {
-            tp++;
-          } else {
-            fn++;
-          }
-        } else {
-          if (inference_data[i * inference_width + 1] >=
-              (thresholds_list[idx_thresh])) {
-            fp++;
-          } else {
-            tn++;
-          }
-        }
+    for (size_t i = 0; i < batch_size; i++) {
+      // Column 1 of each row is used as the score of the sample.
+      const T pred = inference_data[i * inference_width + 1];
+      // Validate before the cast: a negative or NaN score cannot be converted
+      // to an unsigned index, and a score above 1 would overrun the buckets.
+      PADDLE_ENFORCE(pred >= static_cast<T>(0) && pred <= static_cast<T>(1),
+                     "The predicted score of AUC should be in range [0, 1].");
+      uint32_t binIdx = static_cast<uint32_t>(pred * num_thresholds);
+      if (label_data[i]) {
+        stat_pos[binIdx] += 1;
+      } else {
+        stat_neg[binIdx] += 1;
       }
-      // store rates
-      tp_data[idx_thresh] += tp;
-      fn_data[idx_thresh] += fn;
-      tn_data[idx_thresh] += tn;
-      fp_data[idx_thresh] += fp;
     }
-    // epsilon to avoid divide by zero.
-    double epsilon = 1e-6;
-    // Riemann sum to caculate auc.
-    Tensor tp_rate, fp_rate, rec_rate;
-    tp_rate.Resize({num_thresholds});
-    fp_rate.Resize({num_thresholds});
-    rec_rate.Resize({num_thresholds});
-    auto* tp_rate_data = tp_rate.mutable_data<double>(ctx.GetPlace());
-    auto* fp_rate_data = fp_rate.mutable_data<double>(ctx.GetPlace());
-    auto* rec_rate_data = rec_rate.mutable_data<double>(ctx.GetPlace());
-    for (int i = 0; i < num_thresholds; i++) {
-      tp_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
-                        (tp_data[i] + fn_data[i] + epsilon);
-      fp_rate_data[i] =
-          static_cast<double>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
-      rec_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
-                         (tp_data[i] + fp_data[i] + epsilon);
+
+    *auc = 0.0f;
+
+    double totPos = 0.0;
+    double totNeg = 0.0;
+    double totPosPrev = 0.0;
+    double totNegPrev = 0.0;
+
+    // Sweep the buckets from the highest score down, summing the trapezoids
+    // under the curve of cumulative positives over cumulative negatives.
+    int idx = num_thresholds;
+
+    while (idx >= 0) {
+      totPosPrev = totPos;
+      totNegPrev = totNeg;
+      totPos += stat_pos[idx];
+      totNeg += stat_neg[idx];
+      *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
+
+      --idx;
     }
-    *auc_data = 0.0f;
-    if (curve == "ROC") {
-      for (int i = 0; i < num_thresholds - 1; i++) {
-        auto dx = fp_rate_data[i] - fp_rate_data[i + 1];
-        auto y = (tp_rate_data[i] + tp_rate_data[i + 1]) / 2.0f;
-        *auc_data = *auc_data + dx * y;
-      }
-    } else if (curve == "PR") {
-      for (int i = 1; i < num_thresholds; i++) {
-        auto dx = tp_rate_data[i] - tp_rate_data[i - 1];
-        auto y = (rec_rate_data[i] + rec_rate_data[i - 1]) / 2.0f;
-        *auc_data = *auc_data + dx * y;
-      }
+
+    // Normalize by the number of positive/negative pairs; skipped when either
+    // class is absent so that the division never sees a zero denominator.
+    if (totPos > 0.0 && totNeg > 0.0) {
+      *auc = *auc / totPos / totNeg;
     }
   }
 };

--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -119,7 +119,8 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
                  const framework::Tensor& last_scale,
                  const framework::Tensor& iter, const int window_size,
                  framework::Tensor* scales_arr, framework::Tensor* out_scale) {
-    auto& gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+
    T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
    T* out_scale_data = out_scale->mutable_data<T>(gpu_place);


--- a/paddle/fluid/operators/flatten_op.cc
+++ b/paddle/fluid/operators/flatten_op.cc
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -30,14 +30,7 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
                 "Input(WeightX) of GRU should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
                 "Input(WeightH) of GRU should not be null.");
-
  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
-                 "Output(ReorderedH0) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
-                 "Output(BatchedInput) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
-                 "Output(BatchedOut) of GRU should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                 "Output(Hidden) of GRU should not be null.");

@@ -80,15 +73,20 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
  }
  framework::DDim out_dims({x_dims[0], frame_size});
  ctx->SetOutputDim("Hidden", out_dims);
-  ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
-  ctx->SetOutputDim("BatchedOut", out_dims);
  ctx->ShareLoD("X", "Hidden");
-
  int xx_width;
  if (ctx->Attrs().Get<bool>("use_seq")) {
    xx_width = wx_dims[1];
  } else {
    xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
+                   "Output(ReorderedH0) of GRU should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
+                   "Output(BatchedInput) of GRU should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
+                   "Output(BatchedOut) of GRU should not be null.");
+    ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
+    ctx->SetOutputDim("BatchedOut", out_dims);
  }
  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
  ctx->ShareLoD("X", "XX");

--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
--- a/paddle/fluid/operators/layer_norm_op.cu
+++ b/paddle/fluid/operators/layer_norm_op.cu
@@ -67,27 +67,27 @@ template <typename T, int BlockDim>
 __global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
                                 T *y, T *mean, T *var, float epsilon,
                                 int feature_size) {
-  using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
+  using BlockReduce = cub::BlockReduce<PairForLayerNorm<double>, BlockDim>;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  int beg_idx = blockIdx.x * feature_size + threadIdx.x;
  int end_idx = (blockIdx.x + 1) * feature_size;

  // Step 1: Reduce to calculate mean and var
-  T mean_val = static_cast<T>(0);
-  T var_val = static_cast<T>(0);
+  double mean_val = 0;
+  double var_val = 0;
  for (int i = beg_idx; i < end_idx; i += BlockDim) {
    T tmp = x[i];
    mean_val += tmp;
    var_val += (tmp * tmp);
  }
  auto pair = BlockReduce(temp_storage)
-                  .Reduce(PairForLayerNorm<T>(mean_val, var_val),
-                          PairForLayerNormAddFunctor<T>());
+                  .Reduce(PairForLayerNorm<double>(mean_val, var_val),
+                          PairForLayerNormAddFunctor<double>());
  if (threadIdx.x == 0) {
    auto tmp = pair.first_ / feature_size;
-    mean[blockIdx.x] = tmp;
-    var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp;
+    mean[blockIdx.x] = static_cast<T>(tmp);
+    var[blockIdx.x] = static_cast<T>(pair.second_ / feature_size - tmp * tmp);
  }
  __syncthreads();
  mean_val = mean[blockIdx.x];

--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -57,7 +57,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
          memset(output + i * row_width, 0, row_width * sizeof(T));
        } else {
          PADDLE_ENFORCE_LT(ids[i], row_number);
-          PADDLE_ENFORCE_GE(ids[i], 0);
+          PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
          memcpy(output + i * row_width, table + ids[i] * row_width,
                 row_width * sizeof(T));
        }

--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
--- a/paddle/fluid/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
--- a/paddle/fluid/operators/transpose_op.cu.cc
+++ b/paddle/fluid/operators/transpose_op.cu.cc
@@ -21,3 +21,10 @@ REGISTER_OP_CUDA_KERNEL(
 REGISTER_OP_CUDA_KERNEL(
    transpose_grad,
    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
+// Register the float CUDA kernels for transpose2 and transpose2_grad.
+REGISTER_OP_CUDA_KERNEL(
+    transpose2,
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    transpose2_grad,
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
--- a/python/paddle/fluid/tests/unittests/test_flatten_op.py
+++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py
--- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py