diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
index 8298fcf95a5074bce9533e04d54dab79a1460286..b9eaca5ee6b487bb37bb954b3c606c3096d37aeb 100644
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
@@ -19,4 +19,4 @@ ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
 
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py dataset.py models/ /workspace/
+ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md
index 1b0c7dce8bd6faab0c4c59caa1cbe337483cbd16..f40f3c129741f9b6e3654399a9110b065fec7d6c 100644
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@@ -29,9 +29,11 @@ Currently supported `--model` argument include:
     You can choose to use GPU/CPU training. With GPU training, you can specify
     `--gpus <gpu_num>` to run multi GPU training.
 * Run distributed training with parameter servers:
+    * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
     * start parameter servers:
         ```bash
         PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
+        sleep 15
         ```
     * start trainers:
         ```bash
@@ -42,6 +44,16 @@ Currently supported `--model` argument include:
     PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3  PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
     ```
 
+## Prepare the RecordIO file to Achieve Better Performance
+
+Run the following command will generate RecordIO files like "mnist.recordio" under the path
+and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size
+at any time using `fluid.batch`.
+
+```bash
+python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
+```
+
 ## Run Distributed Benchmark on Kubernetes Cluster
 
 You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 49f26255f315c3c368f42b367dfc6487ffa0deb5..62a05234c45ee4fe1dc21f5a74efc269227154db 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -38,10 +38,12 @@ def parse_args():
         default='resnet',
         help='The model to run benchmark with.')
     parser.add_argument(
-        '--batch_size', type=int, default=32, help='The minibatch size.')
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The batch size on each gpu.')
     parser.add_argument(
         '--learning_rate', type=float, default=0.001, help='The learning rate.')
-    # TODO(wuyi): add "--use_fake_data" option back.
     parser.add_argument(
         '--skip_batch_num',
         type=int,
@@ -49,7 +51,10 @@ def parse_args():
         help='The first num of minibatch num to skip, for better performance test'
     )
     parser.add_argument(
-        '--iterations', type=int, default=80, help='The number of minibatches.')
+        '--iterations',
+        type=int,
+        default=80,
+        help='The number of minibatches, set to -1 to run all batches.')
     parser.add_argument(
         '--pass_num', type=int, default=100, help='The number of passes.')
     parser.add_argument(
@@ -69,6 +74,7 @@ def parse_args():
         type=int,
         default=1,
         help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    # this option is available only for vgg and resnet.
     parser.add_argument(
         '--cpus',
         type=int,
@@ -78,7 +84,7 @@ def parse_args():
         '--data_set',
         type=str,
         default='flowers',
-        choices=['cifar10', 'flowers'],
+        choices=['cifar10', 'flowers', 'imagenet'],
         help='Optional dataset for benchmark.')
     parser.add_argument(
         '--infer_only', action='store_true', help='If set, run forward only.')
@@ -108,6 +114,16 @@ def parse_args():
         default='local',
         choices=['local', 'pserver', 'nccl2'],
         help='Choose parameter update method, can be local, pserver, nccl2.')
+    parser.add_argument(
+        '--use_reader_op',
+        action='store_true',
+        help='Whether to use reader op, and must specify the data path if set this to true.'
+    )
+    parser.add_argument(
+        '--data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the training recordio files.')
     args = parser.parse_args()
     return args
 
@@ -210,26 +226,50 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
     place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
     exe = fluid.Executor(place)
     exe.run(startup_prog)
-    feed_var_list = [
-        var for var in train_prog.global_block().vars.itervalues()
-        if var.is_data
-    ]
-    feeder = fluid.DataFeeder(feed_var_list, place)
+
+    if not args.use_reader_op:
+        feed_var_list = [
+            var for var in train_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)
 
     iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in range(args.pass_num):
         train_losses = []
-        for batch_id, data in enumerate(train_reader()):
+        if not args.use_reader_op:
+            reader_generator = train_reader()
+        batch_id = 0
+        data = None
+        while True:
+            if not args.use_reader_op:
+                data = next(reader_generator, None)
+                if data == None:
+                    break
+            if iters == args.iterations:
+                break
             if iters == args.skip_batch_num:
                 start_time = time.time()
                 num_samples = 0
-            if iters == args.iterations:
-                break
-            loss = exe.run(train_prog,
-                           feed=feeder.feed(data),
-                           fetch_list=[avg_loss])
+
+            if args.use_reader_op:
+                try:
+                    loss = exe.run(train_prog, fetch_list=[avg_loss])
+                except fluid.core.EnforceNotMet as ex:
+                    break
+            else:
+                loss = exe.run(train_prog,
+                               feed=feeder.feed(data),
+                               fetch_list=[avg_loss])
             iters += 1
-            num_samples += len(data)
+            batch_id += 1
+            # FIXME(wuyi): For use_reader_op, if the current
+            # pass is not the last, the last batch of this pass
+            # is also equal to args.batch_size.
+            if args.use_reader_op:
+                num_samples += args.batch_size * args.gpus
+            else:
+                num_samples += len(data)
             train_losses.append(loss)
             print("Pass: %d, Iter: %d, Loss: %f\n" %
                   (pass_id, iters, np.mean(train_losses)))
@@ -250,10 +290,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
 def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                    batch_acc, args, train_prog, startup_prog, nccl_id_var,
                    num_trainers, trainer_id):
-    feed_var_list = [
-        var for var in train_prog.global_block().vars.itervalues()
-        if var.is_data
-    ]
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    if not args.use_reader_op:
+        feed_var_list = [
+            var for var in train_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)
+
     # generate fake:
     if args.use_fake_data:
         for var in feed_var_list:
@@ -270,7 +314,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                        "value": 1.0,
                        "dtype": var.dtype})
 
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
     if nccl_id_var and trainer_id == 0:
         #FIXME(wuyi): wait other trainer to start listening
         time.sleep(30)
@@ -287,12 +330,21 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
         num_trainers=num_trainers,
         trainer_id=trainer_id)
 
-    feeder = fluid.DataFeeder(feed_var_list, place)
     for pass_id in range(args.pass_num):
         num_samples = 0
         iters = 0
         start_time = time.time()
-        for batch_id, data in enumerate(train_reader()):
+        if not args.use_reader_op:
+            reader_generator = train_reader()
+        batch_id = 0
+        data = None
+        while True:
+            if not args.use_reader_op:
+                data = next(reader_generator, None)
+                if data == None:
+                    break
+            if iters == args.iterations:
+                break
             if args.profile and pass_id == 0 and batch_id == 5:
                 profiler.start_profiler("All")
             elif args.profile and pass_id == 0 and batch_id == 10:
@@ -301,19 +353,25 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
             if iters == args.skip_batch_num:
                 start_time = time.time()
                 num_samples = 0
-            if iters == args.iterations:
-                break
-            if args.use_fake_data:
-                loss, = exe.run([avg_loss.name])
+            if args.use_fake_data or args.use_reader_op:
+                try:
+                    loss, = exe.run([avg_loss.name])
+                except fluid.core.EnforceNotMet as ex:
+                    break
             else:
                 loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
             if args.update_method == "pserver":
                 exe.bcast_params()
-            num_samples += len(data)
+            if args.use_reader_op:
+                num_samples += args.batch_size * args.gpus
+            else:
+                num_samples += len(data)
             iters += 1
             if batch_id % 1 == 0:
                 print("Pass %d, batch %d, loss %s" %
                       (pass_id, batch_id, np.array(loss)))
+            batch_id += 1
+
         print_train_time(start_time, time.time(), num_samples)
         if not args.no_test and batch_acc:
             test_acc = test(startup_exe, infer_prog, test_reader, feeder,
diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py
index 635b3373dd27b21f83afae10b1d24833b81d57eb..69541adf6b7e53fcc1ac9d3c82b5a60ca0a72879 100644
--- a/benchmark/fluid/models/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@@ -197,6 +197,8 @@ def lodtensor_to_ndarray(lod_tensor):
 
 
 def get_model(args):
+    if args.use_reader_op:
+        raise Exception("machine_translation do not support reader op for now.")
     embedding_dim = 512
     encoder_size = 512
     decoder_size = 512
@@ -221,7 +223,7 @@ def get_model(args):
     train_batch_generator = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
 
     test_batch_generator = paddle.batch(
         paddle.reader.shuffle(
diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py
index 28a38a931cf6cfcd5dd858b363b3d29b70368315..8e740dc6896b7eeeb82170aa13d32987c4df5c48 100644
--- a/benchmark/fluid/models/mnist.py
+++ b/benchmark/fluid/models/mnist.py
@@ -20,6 +20,7 @@ import numpy as np
 import argparse
 import time
 import cProfile
+import os
 
 import paddle
 import paddle.fluid as fluid
@@ -65,9 +66,24 @@ def cnn_model(data):
 
 
 def get_model(args):
-    # Input data
-    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1, 1, 28, 28], (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        images, label = fluid.layers.read_file(data_file)
+    else:
+        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
     if args.device == 'CPU' and args.cpus > 1:
         places = fluid.layers.get_places(args.cpus)
@@ -103,7 +119,7 @@ def get_model(args):
 
     # Reader
     train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
     test_reader = paddle.batch(
         paddle.dataset.mnist.test(), batch_size=args.batch_size)
     return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
index f951f73a35dc4dc6f796178ebbc3e2886b2d7d8c..2ee2b5be09bfcc2e7fcec7eb2f80e28e4e75ab3d 100644
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 import functools
 import numpy as np
 import time
+import os
 
 import cProfile, pstats, StringIO
 
@@ -26,6 +27,7 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
+from recordio_converter import imagenet_train, imagenet_test
 
 
 def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
@@ -122,16 +124,48 @@ def get_model(args):
         else:
             dshape = [32, 32, 3]
         model = resnet_cifar10
-    else:
+        train_reader = paddle.dataset.cifar.train10()
+        test_reader = paddle.dataset.cifar.test10()
+    elif args.data_set == "flowers":
         class_dim = 102
         if args.data_format == 'NCHW':
             dshape = [3, 224, 224]
         else:
             dshape = [224, 224, 3]
         model = resnet_imagenet
-
-    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        train_reader = paddle.dataset.flowers.train()
+        test_reader = paddle.dataset.flowers.test()
+    elif args.data_set == "imagenet":
+        class_dim = 1000
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+        model = resnet_imagenet
+        if not args.data_path:
+            raise Exception(
+                "Must specify --data_path when training with imagenet")
+        train_reader = imagenet_train(args.data_path)
+        test_reader = imagenet_test(args.data_path)
+
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1] + dshape, (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        input, label = fluid.layers.read_file(data_file)
+    else:
+        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
     if args.device == 'CPU' and args.cpus > 1:
         places = fluid.layers.get_places(args.cpus)
@@ -162,15 +196,10 @@ def get_model(args):
 
     optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
 
-    train_reader = paddle.batch(
+    batched_train_reader = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
-        batch_size=args.batch_size)
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10()
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-        batch_size=args.batch_size)
-
-    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
+            train_reader, buf_size=5120),
+        batch_size=args.batch_size * args.gpus)
+    batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size)
+
+    return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc
diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py
index 1b680d76a8ba1ead7c8c50065e1817c45b951b27..e1c4857f1a365f6480929ea57296a9801f5ea945 100644
--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size):
 
 
 def get_model(args):
+    if args.use_reader_op:
+        raise Exception(
+            "stacked_dynamic_lstm do not support reader op for now.")
     lstm_size = 512
     emb_dim = 512
     crop_size = 1500
@@ -114,7 +117,7 @@ def get_model(args):
     train_reader = batch(
         paddle.reader.shuffle(
             crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
     test_reader = batch(
         paddle.reader.shuffle(
             crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py
index 53856c5f7acd3a4e1476ec57154a880bb6f984c9..6092cdeb884b3a9b60a3bcf20b022f2b0685e6aa 100644
--- a/benchmark/fluid/models/vgg.py
+++ b/benchmark/fluid/models/vgg.py
@@ -22,6 +22,7 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 import argparse
 import functools
+import os
 
 
 def vgg16_bn_drop(input):
@@ -65,9 +66,24 @@ def get_model(args):
         else:
             data_shape = [224, 224, 3]
 
-    # Input data
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1] + data_shape, (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        images, label = fluid.layers.read_file(data_file)
+    else:
+        images = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
     # Train program
     net = vgg16_bn_drop(images)
@@ -95,7 +111,7 @@ def get_model(args):
             paddle.dataset.cifar.train10()
             if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
             buf_size=5120),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
     test_reader = paddle.batch(
         paddle.dataset.cifar.test10()
         if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
diff --git a/benchmark/fluid/recordio_converter.py b/benchmark/fluid/recordio_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2dc39109bf1beaf147b046560c92fbd2416d8e6
--- /dev/null
+++ b/benchmark/fluid/recordio_converter.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.dataset import mnist, cifar, flowers, image
+
+
+def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
+                       shape_label):
+    num_batches = 0
+    with fluid.program_guard(fluid.Program(), fluid.Program()):
+        reader = paddle.batch(py_reader(), batch_size=batch_size)
+        feeder = fluid.DataFeeder(
+            feed_list=[  # order is image and label
+                fluid.layers.data(
+                    name='image', shape=shape_data),
+                fluid.layers.data(
+                    name='label', shape=shape_label, dtype='int64'),
+            ],
+            place=fluid.CPUPlace())
+        num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
+            outfilepath, reader, feeder)
+    return num_batches
+
+
+def prepare_mnist(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "mnist.recordio")
+    convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])
+
+
+def prepare_cifar10(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "cifar.recordio")
+    convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])
+
+
+def prepare_flowers(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "flowers.recordio")
+    convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
+                       [1])
+
+
+def default_mapper(sample):
+    img, label = sample
+    img = image.simple_transform(
+        img, 256, 224, True, mean=[103.94, 116.78, 123.68])
+    return img.flatten().astype('float32'), label
+
+
+def imagenet_train(data_dir):
+    contents = os.listdir(data_dir)
+    if set(contents) != set(
+        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
+        raise Exception("Imagenet data contents error!")
+    img2label = dict()
+    imgfilelist = []
+    with open(os.path.join(data_dir, "train.txt")) as fn:
+        while 1:
+            l = fn.readline()
+            if not l:
+                break
+            img, lbl = l[:-1].split(" ")
+            img2label[img] = int(lbl)
+            imgfilelist.append(img)
+    # shuffle all, this is slow
+    random.shuffle(imgfilelist)
+
+    def train_reader():
+        for idx, imgfile in enumerate(imgfilelist):
+            data = image.load_image(
+                os.path.join(data_dir, "train", imgfile.lower()))
+            label = [img2label[imgfile], ]
+            yield [data, label]
+
+    return paddle.reader.map_readers(default_mapper, train_reader)
+
+
+def imagenet_test(data_dir):
+    contents = os.listdir(data_dir)
+    if set(contents) != set(
+        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
+        raise Exception("Imagenet data contents error!")
+    img2label = dict()
+    imgfilelist = []
+    with open(os.path.join(data_dir, "val.txt")) as fn:
+        while 1:
+            l = fn.readline()
+            if not l:
+                break
+            img, lbl = l[:-1].split(" ")
+            img2label[img] = int(lbl)
+            imgfilelist.append(img)
+
+    def test_reader():
+        for idx, imgfile in enumerate(imgfilelist):
+            base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
+            image_path = ".".join([base_path, "jpeg"])
+            data = image.load_image(image_path)
+            label = [img2label[imgfile], ]
+            yield [data, label]
+
+    return paddle.reader.map_readers(default_mapper, test_reader)
+
+
+# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
+def convert_reader_to_recordio_files(
+        filename,
+        batch_per_file,
+        reader_creator,
+        feeder,
+        compressor=core.RecordIOWriter.Compressor.Snappy,
+        max_num_records=1000,
+        feed_order=None):
+    if feed_order is None:
+        feed_order = feeder.feed_names
+    f_name, f_ext = os.path.splitext(filename)
+    assert (f_ext == ".recordio")
+
+    lines = []
+    f_idx = 0
+    counter = 0
+    for idx, batch in enumerate(reader_creator()):
+        lines.append(batch)
+        if idx >= batch_per_file and idx % batch_per_file == 0:
+            filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
+            with fluid.recordio_writer.create_recordio_writer(
+                    filename, compressor, max_num_records) as writer:
+                for l in lines:
+                    res = feeder.feed(l)
+                    for each in feed_order:
+                        writer.append_tensor(res[each])
+                    writer.complete_append_tensor()
+                    counter += 1
+                lines = []
+                f_idx += 1
+            print("written file: ", filename)
+    return counter
+
+
+def prepare_imagenet(inpath, outpath, batch_size):
+    r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
+    feeder = fluid.DataFeeder(
+        feed_list=[
+            fluid.layers.data(
+                name="image", shape=[3, 224, 224]), fluid.layers.data(
+                    name="label", shape=[1], dtype='int64')
+        ],
+        place=fluid.CPUPlace())
+    outpath = os.path.join(outpath, "imagenet.recordio")
+    convert_reader_to_recordio_files(outpath, 10000, r, feeder)
diff --git a/benchmark/fluid/run_fluid_benchmark.sh b/benchmark/fluid/run_fluid_benchmark.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4309a3126c1d72fe1eb2d5ec423075aea4d3ec88
--- /dev/null
+++ b/benchmark/fluid/run_fluid_benchmark.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
+
+sleep 15
+
+CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
+
+CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 682614742cf1bd3130c638020a2545e16226d4d6..4158d0528a1aea52c2a3f0880fe1000183a9df53 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -92,6 +92,9 @@ if(WITH_GPU)
         if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
             message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
         endif()
+        if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+            message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+        endif()
         include_directories(${TENSORRT_INCLUDE_DIR})
     endif()
 elseif(WITH_AMD_GPU)
diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst
index de7e9eb75c3a053179f2d03ac887955bb4e0a6d2..6421c5308271c2508597d849c79709255caf349a 100644
--- a/doc/v2/build_and_install/build_from_source_cn.rst
+++ b/doc/v2/build_and_install/build_from_source_cn.rst
@@ -106,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
 - 学习 Docker 有多难？
 
-  理解 Docker 并不难，大概花十分钟看一下 `这篇文章 <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+  理解 Docker 并不难，大概花十分钟看一下 `如何使用Docker <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
 
 - 我可以用 IDE 吗？
 
@@ -123,7 +123,7 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
 - 可以并行编译吗？
 
-  是的。我们的 Docker image 运行一个 `Bash脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+  是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
 
 - Docker 需要 sudo
 
@@ -131,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
 - 在 Windows/MacOS 上编译很慢
 
-  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `这个issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。
 
 - 磁盘不够
 
-  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `这篇文章 <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
+  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `如何删除Docker Container <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
 
 
 .. _compile_deps:
@@ -195,7 +195,7 @@ BLAS
 
 PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
 `OpenBlAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
-还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+还会下载MKL-DNN数学库，详细参考 `mkldnn设计文档 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
 
 如果关闭MKL，则会使用OpenBLAS作为BLAS库。
 
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index 1e3bb7bf16f969255dba6f6ec7a6a70bbb1e07ee..f279020e9334323ebdf3125a8833044cd9eccae5 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -24,31 +24,37 @@ set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
 set(inference_deps paddle_inference_api paddle_fluid_api)
 
 # if anakin is set enable anakin api implementation
-if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
+if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
     set(ANAKIN_FOUND ON)
 else()
     set(ANAKIN_FOUND OFF)
 endif()
 
+function(fetch_include_recursively root_dir) 
+    if (IS_DIRECTORY ${root_dir}) 
+        include_directories(${root_dir})
+    endif()
+
+    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
+    foreach(sub ${ALL_SUB})
+        if (IS_DIRECTORY ${root_dir}/${sub})
+            fetch_include_recursively(${root_dir}/${sub})
+        endif()
+    endforeach()
+endfunction()
+
 if (ANAKIN_FOUND)
     # Anakin's code style doesn't follow google c style.
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment
-                                            -Wno-error=reorder
-                                            -Wno-error=format
-                                            -Wno-error=switch
-                                            -Wno-error=return-type
-                                            -Wno-error=non-virtual-dtor
-                                            -Wno-error=cpp")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
 
     message(STATUS "Anakin for inference is enabled")
     message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
-    include_directories("${ANAKIN_INCLUDE}")
-    # Anakin's source path is a mass, need to set sub-directories trivially.
-    include_directories("${ANAKIN_INCLUDE}/saber")
-    link_directories("${ANAKIN_LIBRARY}")
+    fetch_include_recursively(${ANAKIN_INCLUDE})
+
+    link_directories(${ANAKIN_LIBRARY})
 
-    nv_library(inference_anakin_api SRCS paddle_inference_api_anakin_engine.cc)
-    target_link_libraries(inference_anakin_api anakin)
+    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
     list(APPEND inference_deps inference_anakin_api)
 endif()
 
@@ -73,7 +79,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)
 
 cc_library(paddle_inference_api
-    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc 
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
 cc_test(test_paddle_inference_api
@@ -84,8 +90,8 @@ inference_api_test(test_paddle_inference_api_impl
                     ARGS test_word2vec test_image_classification)
 
 if (ANAKIN_FOUND)
-  nv_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
-    DEPS ${inference_deps} protobuf)
+    cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
+    DEPS ${inference_deps})
 endif()
 
 if(WITH_TESTING)
diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc
index 9b4843f714f11484860056711fd223edc8a5d037..192a6414260ce06048b8c765402d89882cabc51b 100644
--- a/paddle/contrib/inference/demo/simple_on_word2vec.cc
+++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc
@@ -19,8 +19,8 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <memory>
+#include <thread>
 #include "paddle/contrib/inference/paddle_inference_api.h"
-
 namespace paddle {
 namespace demo {
 
@@ -61,13 +61,67 @@ void Main(bool use_gpu) {
     for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
       LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
     }
+    // TODO(Superjomn): this is should be free automatically
+    free(outputs[0].data.data);
+  }
+}
+
+void MainThreads(int num_threads, bool use_gpu) {
+  // Multi-threads only support on CPU
+  // 0. Create PaddlePredictor with a config.
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.use_gpu = use_gpu;
+  config.fraction_of_gpu_memory = 0.15;
+  config.device = 0;
+  auto main_predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_threads; ++tid) {
+    threads.emplace_back([&, tid]() {
+      // 1. clone a predictor which shares the same parameters
+      auto predictor = main_predictor->Clone();
+      constexpr int num_batches = 3;
+      for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
+        // 2. Dummy Input Data
+        int64_t data[4] = {1, 2, 3, 4};
+        PaddleBuf buf{.data = data, .length = sizeof(data)};
+        PaddleTensor tensor{.name = "",
+                            .shape = std::vector<int>({4, 1}),
+                            .data = buf,
+                            .dtype = PaddleDType::INT64};
+        std::vector<PaddleTensor> inputs(4, tensor);
+        std::vector<PaddleTensor> outputs;
+        // 3. Run
+        CHECK(predictor->Run(inputs, &outputs));
+
+        // 4. Get output.
+        ASSERT_EQ(outputs.size(), 1UL);
+        LOG(INFO) << "TID: " << tid << ", "
+                  << "output buffer size: " << outputs.front().data.length;
+        const size_t num_elements = outputs.front().data.length / sizeof(float);
+        // The outputs' buffers are in CPU memory.
+        for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+          LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
+        }
+        free(outputs[0].data.data);
+      }
+    });
+  }
+  for (int i = 0; i < num_threads; ++i) {
+    threads[i].join();
   }
 }
 
 TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
 
 #ifdef PADDLE_WITH_CUDA
 TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
 #endif
 
 }  // namespace demo
diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h
index c4588cf04030b9627dbe9b40c1bb04d1e782ebba..77e2d77b6b7fe3eeed865c8de0818d059cfa6c6e 100644
--- a/paddle/contrib/inference/paddle_inference_api.h
+++ b/paddle/contrib/inference/paddle_inference_api.h
@@ -113,5 +113,4 @@ struct AnakinConfig : public PaddlePredictor::Config {
 // Similarly, each engine kind should map to a unique predictor implementation.
 template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
-
 }  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
index 865d7ac10db55ce9565f4b1a35defa2a3d1d40ef..ea7781f691da81befd5d11c226c35e1da79baaaa 100644
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -24,8 +24,16 @@ PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
 }
 
 bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
-  // TODO(Superjomn) Tell anakin to support return code.
-  engine_.Build(config.model_file, config.max_batch_size);
+  if (!(graph_.load(config.model_file))) {
+    return false;
+  }
+  graph_.ResetBatchSize("input_0", config.max_batch_size);
+  // optimization for graph
+  if (!(graph_.Optimize())) {
+    return false;
+  }
+  // construct executer
+  executor_.init(graph_);
   return true;
 }
 
@@ -38,24 +46,30 @@ bool PaddleInferenceAnakinPredictor::Run(
                  << "'s type is not float";
       return false;
     }
-    engine_.SetInputFromCPU(
-        input.name, static_cast<float *>(input.data.data), input.data.length);
+    auto d_tensor_in_p = executor_.get_in(input.name);
+    float *d_data_p = d_tensor_in_p->mutable_data();
+    if (cudaMemcpy(d_data_p,
+                   static_cast<float *>(input.data.data),
+                   d_tensor_in_p->valid_size() * sizeof(float),
+                   cudaMemcpyHostToDevice) != 0) {
+      LOG(ERROR) << "copy data from CPU to GPU error";
+      return false;
+    }
   }
 
-  // TODO(Superjomn) Tell anakin to support return code.
-  engine_.Execute();
+  executor_.prediction();
 
   if (output_data->empty()) {
     LOG(ERROR) << "At least one output should be set with tensors' names.";
     return false;
   }
   for (auto &output : *output_data) {
-    auto *tensor = engine_.GetOutputInGPU(output.name);
+    auto *tensor = executor_.get_out(output.name);
     output.shape = tensor->shape();
     // Copy data from GPU -> CPU
     if (cudaMemcpy(output.data.data,
-                   tensor->data(),
-                   tensor->size(),
+                   tensor->mutable_data(),
+                   tensor->valid_size() * sizeof(float),
                    cudaMemcpyDeviceToHost) != 0) {
       LOG(ERROR) << "copy data from GPU to CPU error";
       return false;
@@ -64,9 +78,26 @@ bool PaddleInferenceAnakinPredictor::Run(
   return true;
 }
 
-// TODO(Superjomn) To implement latter.
+anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+    &PaddleInferenceAnakinPredictor::get_executer() {
+  return executor_;
+}
+
+// the cloned new Predictor of anakin share the same net weights from original
+// Predictor
 std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
-  return nullptr;
+  VLOG(3) << "Anakin Predictor::clone";
+  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
+  // construct executer from other graph
+  auto anakin_predictor_p =
+      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
+  if (!anakin_predictor_p) {
+    LOG(ERROR) << "fail to call Init";
+    return nullptr;
+  }
+  anakin_predictor_p->get_executer().init(graph_);
+
+  return std::move(cls);
 }
 
 // A factory to help create difference predictor.
@@ -74,6 +105,7 @@ template <>
 std::unique_ptr<PaddlePredictor>
 CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
     const AnakinConfig &config) {
+  VLOG(3) << "Anakin Predictor create.";
   std::unique_ptr<PaddlePredictor> x(
       new PaddleInferenceAnakinPredictor(config));
   return x;
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
index fe9f562e9d1d40c30585bcb68fa51e445bedb4aa..181784cbdf91fe2f50e20f4d447448a42a18d301 100644
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
@@ -20,32 +20,42 @@ limitations under the License. */
 #pragma once
 
 // NOTE This header file do not have namespace.
-// TODO(Superjomn) Tell Anakin to provide better APIs.
-#include <test/framework/net/paddle_api.h>
+//#include <test/framework/net/paddle_api.h>
 #include "paddle/contrib/inference/paddle_inference_api.h"
 
+#include "framework/core/net/net.h"
+#include "saber/saber_types.h"
+
 namespace paddle {
 
 class PaddleInferenceAnakinPredictor : public PaddlePredictor {
  public:
+  PaddleInferenceAnakinPredictor() {}
+
   PaddleInferenceAnakinPredictor(const AnakinConfig& config);
 
   // NOTE Unlike the native engine, the buffers of anakin engine's output_data
   // should be allocated first.
-  // TODO(Superjomn) should unify all the behaviors of output_data accross all
-  // the engines.
   bool Run(const std::vector<PaddleTensor>& inputs,
            std::vector<PaddleTensor>* output_data) override;
 
   std::unique_ptr<PaddlePredictor> Clone() override;
 
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
+  get_executer();
+
+  ~PaddleInferenceAnakinPredictor() override{};
+
  private:
   bool Init(const AnakinConfig& config);
 
-  anakin::AnakinEngine<anakin::NV,
+  anakin::graph::Graph<anakin::NV,
                        anakin::saber::AK_FLOAT,
                        anakin::Precision::FP32>
-      engine_;
+      graph_;
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+      executor_;
+  AnakinConfig config_;
 };
 
 }  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
index 43324bc67cba16c36d9dbcb58ccde1c57293085e..47b9c6fa285b623d2b08f45917cb3474dbc2ab83 100644
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -12,16 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include "gflags/gflags.h"
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
 namespace paddle {
 
-TEST(inference, anakin) {
+AnakinConfig GetConfig() {
   AnakinConfig config;
+  config.model_file = "./mobilenet_v2.anakin.bin";
+  config.device = 0;
+  config.max_batch_size = 1;
+  return config;
+}
 
-  auto engine =
+TEST(inference, anakin) {
+  AnakinConfig config = GetConfig();
+  auto predictor =
       CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
+
+  float data[1 * 3 * 224 * 224] = {1.0f};
+
+  PaddleBuf buf{.data = data, .length = sizeof(data)};
+  PaddleTensor tensor{.name = "input_0",
+                      .shape = std::vector<int>({1, 3, 224, 224}),
+                      .data = buf,
+                      .dtype = PaddleDType::FLOAT32};
+
+  // For simplicity, we set all the slots with the same data.
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+
+  float data_out[1000];
+
+  PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
+  PaddleTensor tensor_out{.name = "prob_out",
+                          .shape = std::vector<int>({1000, 1}),
+                          .data = buf_out,
+                          .dtype = PaddleDType::FLOAT32};
+
+  std::vector<PaddleTensor> outputs(1, tensor_out);
+
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+
+  float* data_o = static_cast<float*>(outputs[0].data.data);
+  for (size_t j = 0; j < 1000; ++j) {
+    LOG(INFO) << "output[" << j << "]: " << data_o[j];
+  }
 }
 
 }  // namespace paddle
diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
index 1f960677163988be6f4c502738861bf86588f406..4b6cb7b051d1ad2c63e895017c7faf1245c22612 100644
--- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -15,6 +15,8 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include <thread>
+
 #include "gflags/gflags.h"
 #include "paddle/contrib/inference/paddle_inference_api_impl.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
@@ -45,14 +47,19 @@ NativeConfig GetConfig() {
   config.model_dir = FLAGS_dirname + "word2vec.inference.model";
   LOG(INFO) << "dirname  " << config.model_dir;
   config.fraction_of_gpu_memory = 0.15;
+#ifdef PADDLE_WITH_CUDA
   config.use_gpu = true;
+#else
+  config.use_gpu = false;
+#endif
   config.device = 0;
   return config;
 }
 
-TEST(paddle_inference_api_impl, word2vec) {
+void MainWord2Vec(bool use_gpu) {
   NativeConfig config = GetConfig();
   auto predictor = CreatePaddlePredictor<NativeConfig>(config);
+  config.use_gpu = use_gpu;
 
   framework::LoDTensor first_word, second_word, third_word, fourth_word;
   framework::LoD lod{{0, 1}};
@@ -100,11 +107,12 @@ TEST(paddle_inference_api_impl, word2vec) {
   free(outputs[0].data.data);
 }
 
-TEST(paddle_inference_api_impl, image_classification) {
+void MainImageClassification(bool use_gpu) {
   int batch_size = 2;
   bool use_mkldnn = false;
   bool repeat = false;
   NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
   config.model_dir =
       FLAGS_dirname + "image_classification_resnet.inference.model";
 
@@ -149,4 +157,143 @@ TEST(paddle_inference_api_impl, image_classification) {
   free(data);
 }
 
+void MainThreadsWord2Vec(bool use_gpu) {
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
+
+  // prepare inputs data and reference results
+  constexpr int num_jobs = 3;
+  std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
+  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
+  std::vector<framework::LoDTensor> refs(num_jobs);
+  for (size_t i = 0; i < jobs.size(); ++i) {
+    // each job has 4 words
+    jobs[i].resize(4);
+    for (size_t j = 0; j < 4; ++j) {
+      framework::LoD lod{{0, 1}};
+      int64_t dict_size = 2073;  // The size of dictionary
+      SetupLoDTensor(&jobs[i][j], lod, static_cast<int64_t>(0), dict_size - 1);
+      paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j]));
+    }
+
+    // get reference result of each job
+    std::vector<paddle::framework::LoDTensor*> ref_feeds;
+    std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    for (auto& word : jobs[i]) {
+      ref_feeds.push_back(&word);
+    }
+    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
+  }
+
+  // create threads and each thread run 1 job
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto predictor = main_predictor->Clone();
+      auto& local_inputs = paddle_tensor_feeds[tid];
+      std::vector<PaddleTensor> local_outputs;
+      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
+
+      // check outputs range
+      ASSERT_EQ(local_outputs.size(), 1UL);
+      const size_t len = local_outputs[0].data.length;
+      float* data = static_cast<float*>(local_outputs[0].data.data);
+      for (size_t j = 0; j < len / sizeof(float); ++j) {
+        ASSERT_LT(data[j], 1.0);
+        ASSERT_GT(data[j], -1.0);
+      }
+
+      // check outputs correctness
+      float* ref_data = refs[tid].data<float>();
+      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
+      for (int i = 0; i < refs[tid].numel(); ++i) {
+        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+      }
+      free(data);
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+}
+
+void MainThreadsImageClassification(bool use_gpu) {
+  constexpr int num_jobs = 4;  // each job run 1 batch
+  constexpr int batch_size = 1;
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  config.model_dir =
+      FLAGS_dirname + "image_classification_resnet.inference.model";
+
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
+  std::vector<framework::LoDTensor> jobs(num_jobs);
+  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
+  std::vector<framework::LoDTensor> refs(num_jobs);
+  for (size_t i = 0; i < jobs.size(); ++i) {
+    // prepare inputs
+    std::vector<std::vector<int64_t>> feed_target_shapes =
+        GetFeedTargetShapes(config.model_dir, /*is_combined*/ false);
+    feed_target_shapes[0][0] = batch_size;
+    framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
+    SetupTensor<float>(&jobs[i], input_dims, 0.f, 1.f);
+    paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i]));
+
+    // get reference result of each job
+    std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
+    std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
+  }
+
+  // create threads and each thread run 1 job
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto predictor = main_predictor->Clone();
+      auto& local_inputs = paddle_tensor_feeds[tid];
+      std::vector<PaddleTensor> local_outputs;
+      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
+
+      // check outputs correctness
+      ASSERT_EQ(local_outputs.size(), 1UL);
+      const size_t len = local_outputs[0].data.length;
+      float* data = static_cast<float*>(local_outputs[0].data.data);
+      float* ref_data = refs[tid].data<float>();
+      EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
+      for (int i = 0; i < refs[tid].numel(); ++i) {
+        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+      }
+      free(data);
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+}
+
+TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); }
+TEST(inference_api_native, word2vec_cpu_threads) {
+  MainThreadsWord2Vec(false /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_cpu) {
+  MainThreadsImageClassification(false /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_cpu_threads) {
+  MainThreadsImageClassification(false /*use_gpu*/);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
+TEST(inference_api_native, word2vec_gpu_threads) {
+  MainThreadsWord2Vec(true /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_gpu) {
+  MainThreadsImageClassification(true /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_gpu_threads) {
+  MainThreadsImageClassification(true /*use_gpu*/);
+}
+
+#endif
+
 }  // namespace paddle
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index dbd375aa31bfbdcb109b6302acf23b3bb3b6befe..627370cd2df7317b4d32aa967565aaf9cf0c7a08 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -87,7 +87,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
 framework_proto glog lod_rank_table feed_fetch_method)
 
 
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h
index 9c5e2cf7ccdcea2822da42210ff1fdb915a9a4ec..b611bb77b4e1ec05b8bd029ac37cefba346c6eb0 100644
--- a/paddle/fluid/framework/data_layout.h
+++ b/paddle/fluid/framework/data_layout.h
@@ -27,6 +27,7 @@ enum class DataLayout {
   kNHWC = 0,
   kNCHW = 1,
   kAnyLayout = 2,
+  kMKLDNN = 3,  // all layouts supported by MKLDNN internally
 };
 
 inline DataLayout StringToDataLayout(const std::string& str) {
@@ -41,6 +42,8 @@ inline DataLayout StringToDataLayout(const std::string& str) {
     return DataLayout::kNCHW;
   } else if (s == "ANYLAYOUT") {
     return DataLayout::kAnyLayout;
+  } else if (s == "MKLDNNLAYOUT") {
+    return DataLayout::kMKLDNN;
   } else {
     PADDLE_THROW("Unknown storage order string: %s", s);
   }
@@ -54,8 +57,10 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) {
       return "NCHW";
     case DataLayout::kAnyLayout:
       return "ANY_LAYOUT";
+    case DataLayout::kMKLDNN:
+      return "MKLDNNLAYOUT";
     default:
-      PADDLE_THROW("unknown DataLayou %d", data_layout);
+      PADDLE_THROW("unknown DataLayout %d", data_layout);
   }
 }
 
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 60ec60a427ba9046ce690eb75c27cd322fdd726d..5b8dfc57ba020cea259041f55a66472ea26b4eec 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -16,6 +16,9 @@
 #include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -88,5 +91,85 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var,
   out->set_layout(expected_kernel_type.data_layout_);
 }
 
+#ifdef PADDLE_WITH_MKLDNN
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+
+void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) {
+  switch (type) {
+    case mkldnn::memory::data_type::f32:
+      return platform::to_void_cast(tensor.data<float>());
+    case mkldnn::memory::data_type::s8:
+      return platform::to_void_cast(tensor.data<char>());
+    case mkldnn::memory::data_type::u8:
+      return platform::to_void_cast(tensor.data<unsigned char>());
+    case mkldnn::memory::data_type::s16:
+      return platform::to_void_cast(tensor.data<int16_t>());
+    case mkldnn::memory::data_type::s32:
+      return platform::to_void_cast(tensor.data<int32_t>());
+    default:
+      PADDLE_THROW("wrong mkldnn type provided");
+  }
+}
+#endif
+
+void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
+                               const OpKernelType& expected_kernel_type,
+                               const Tensor& in, Tensor* out) {
+  auto in_layout = kernel_type_for_var.data_layout_;
+  auto out_layout = expected_kernel_type.data_layout_;
+
+  PADDLE_ENFORCE(
+      in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
+      "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
+      "non-MKLDNN");
+
+#ifdef PADDLE_WITH_MKLDNN
+  PADDLE_ENFORCE(in.format() != memory::format::format_undef &&
+                     in.format() != memory::format::any,
+                 "Input tensor should have specified memory format");
+
+  // Set default as NCHW in case not specified
+  out_layout =
+      out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
+
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
+      pool.Get(expected_kernel_type.place_));
+  auto& cpu_engine = dev_ctx->GetEngine();
+
+  std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
+  std::vector<int> out_tz = in_tz;
+
+  memory::data_type in_type = ToMKLDNNDataType(in.type());
+  PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
+                 "Input tensor type is not supported: ", in.type().name());
+  memory::data_type out_type = in_type;
+
+  memory::format in_format =
+      in_tz.size() == 2 ? memory::format::nc : in.format();
+  memory::format out_format =
+      out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout);
+
+  void* in_data = GetDataFromTensor(in, in_type);
+
+  // output tensor has the same dims as input. Reorder don't change dims
+  out->Resize(in.dims());
+
+  auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
+
+  auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+  auto out_memory =
+      memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+
+  platform::Reorder(in_memory, out_memory);
+
+  out->set_layout(out_layout);
+  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
+  out->set_format(memory::format::format_undef);
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index 06b638663dd334837a3bcb7737e507fcbc871c7a..2ba84ce57fd8aa3d9aa651bdaa2930e459c74e88 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <map>
 #include <vector>
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -22,6 +23,50 @@
 namespace paddle {
 namespace framework {
 
+#ifdef PADDLE_WITH_MKLDNN
+using MKLDNNFormat = mkldnn::memory::format;
+using MKLDNNDataType = mkldnn::memory::data_type;
+
+inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) {
+  switch (layout) {
+    case DataLayout::kNHWC:
+      return MKLDNNFormat::nhwc;
+    case DataLayout::kNCHW:
+      return MKLDNNFormat::nchw;
+    default:
+      PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
+                   DataLayoutToString(layout));
+  }
+}
+
+inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) {
+  switch (format) {
+    case MKLDNNFormat::nhwc:
+      return DataLayout::kNHWC;
+    case MKLDNNFormat::nchw:
+      return DataLayout::kNCHW;
+    default:
+      PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
+  }
+}
+
+inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
+  static const std::map<std::type_index, MKLDNNDataType> dict{
+      {std::type_index(typeid(float)), MKLDNNDataType::f32},  // NOLINT
+      {std::type_index(typeid(char)), MKLDNNDataType::s8},    // NOLINT
+      {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8},
+      {std::type_index(typeid(int16_t)), MKLDNNDataType::s16},
+      {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}};
+  auto iter = dict.find(type);
+  if (iter != dict.end()) return iter->second;
+  return MKLDNNDataType::data_undef;
+}
+#endif
+
+void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
+                               const OpKernelType& expected_kernel_type,
+                               const Tensor& in, Tensor* out);
+
 std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
 
 void TransDataLayout(const OpKernelType& kernel_type_for_var,
diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
index 9c277a27da5af34fc9fb18ca073e369c05ecdf22..b8fcc92697ca1bf1d971f8fef020f31d405605a9 100644
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -33,11 +33,34 @@ void DataTransform(const OpKernelType& expected_kernel_type,
   Tensor in;
   in.ShareDataWith(input_tensor);
   Tensor out;
+  DataLayout lin = kernel_type_for_var.data_layout_;
+  DataLayout lout = expected_kernel_type.data_layout_;
 
   // do layout transform
-  if (NeedTransformLayout(expected_kernel_type.data_layout_,
-                          kernel_type_for_var.data_layout_)) {
-    TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+  if (NeedTransformLayout(lout, lin)) {
+    if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) {
+      PADDLE_ENFORCE(
+          !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN),
+          "No layout transform needed between two MKLDNN OPKernels");
+
+      if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) {
+#ifdef PADDLE_WITH_MKLDNN
+        // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
+        // Just set layout/format. No real transform occur
+        out.ShareDataWith(input_tensor);
+        out.set_layout(DataLayout::kMKLDNN);
+        out.set_format(ToMKLDNNFormat(lin));
+#endif
+      } else {
+        // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
+        // Do transform via MKLDNN lib
+        TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in,
+                                  &out);
+      }
+    } else {
+      // Case3 - transfrom between Non-MKLDNN OPKernels
+      TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+    }
     transformed = true;
     PassTensorData(&out, &in);
   }
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index c026e6c100a303b43650f08cd12d7260258c8f7e..c43826b64cc5140c539df17fdd13d9bee7fefdcd 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -7,12 +7,13 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
 
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
 
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
 
 if(WITH_GPU)
     nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda)
+            dynload_cuda variable_visitor)
     set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
     nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
     nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
@@ -24,10 +25,14 @@ else()
 endif()
 
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
         scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
 
+
+cc_library(graph_builder_factory SRCS graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer)
+
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index 629aa00cb817c4b1446e7b750ca62a7c6b1db670..8036f756b6d6506684c109ab881d546f38176a10 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -59,8 +59,8 @@ struct BroadcastOpHandle : public OpHandleBase {
   void RunImpl() override;
 
  private:
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
 #ifdef PADDLE_WITH_CUDA
   const platform::NCCLContextMap *nccl_ctxs_;
 #endif
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 91bdfe6134ffbd1404336c9d6d1222a505084b2b..64e83acb4dc1995800c4ca3caf81668b24a7c9fe 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <string>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -29,6 +31,8 @@ struct BuildStrategy {
 
   ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
   GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
+
+  std::string debug_graphviz_path_{""};
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..018c9bff71e553d8a3641f06f10b350453676b24
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
@@ -0,0 +1,51 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void FuseVarsOpHandle::RunImpl() {
+  WaitInputVarGenerated(place_);
+
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
+  PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
+
+  auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+
+  auto out_var_handle = out_var_handles[0];
+  auto out_var = scope->Var(out_var_handle->name_);
+
+  auto out_tensor = out_var->GetMutable<LoDTensor>();
+  out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
+
+  int64_t s = 0;
+  for (size_t i = 1; i < out_var_handles.size(); ++i) {
+    auto out_name = out_var_handles[i]->name_;
+    auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
+    auto numel = this->inputs_numel_.at(out_name);
+    out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
+    s += numel;
+  }
+  this->RunAndRecordEvent([] {});
+}
+
+std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..140fb5bb49a33146de974b6d79559b4cf15bdd7b
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h
@@ -0,0 +1,63 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FuseVarsOpHandle : public OpHandleBase {
+ public:
+  FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
+                   const std::unordered_map<std::string, int64_t> &inputs_numel,
+                   const std::type_index &var_type)
+      : local_scope_(local_scope),
+        place_(place),
+        inputs_numel_(inputs_numel),
+        type_(var_type) {
+    total_numel_ = 0;
+    for (auto in_numel : inputs_numel) {
+      PADDLE_ENFORCE_GT(in_numel.second, 0);
+      total_numel_ += in_numel.second;
+    }
+  }
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  Scope *local_scope_;
+  const platform::Place place_;
+  const std::unordered_map<std::string, int64_t> inputs_numel_;
+  const std::type_index type_;
+  int64_t total_numel_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/graph_builder_factory.cc b/paddle/fluid/framework/details/graph_builder_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a04b9bb63c06b40ff5c30c9792cdfad5d64d404c
--- /dev/null
+++ b/paddle/fluid/framework/details/graph_builder_factory.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/graph_builder_factory.h"
+#include <fstream>
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
+  std::unique_ptr<SSAGraphBuilder> res(
+#ifdef PADDLE_WITH_CUDA
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, nccl_ctxs_, strategy_)
+#else
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, strategy_)
+#endif
+          );  // NOLINT
+
+  if (!strategy_.debug_graphviz_path_.empty()) {
+    std::unique_ptr<std::ostream> fout(
+        new std::ofstream(strategy_.debug_graphviz_path_));
+    PADDLE_ENFORCE(fout->good());
+    std::unique_ptr<GraphvizSSAGraphPrinter> graphviz_printer(
+        new GraphvizSSAGraphPrinter());
+    res.reset(new SSAGraghBuilderWithPrinter(
+        std::move(fout), std::move(graphviz_printer), std::move(res)));
+  }
+  return res;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/graph_builder_factory.h b/paddle/fluid/framework/details/graph_builder_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..857ab12d684e19788597e144fc0c46571d06aafc
--- /dev/null
+++ b/paddle/fluid/framework/details/graph_builder_factory.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/place.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace details {
+
+class SSAGraphBuilderFactory {
+ public:
+  SSAGraphBuilderFactory(const std::vector<platform::Place>& places,
+                         const std::string& loss_var_name,
+                         const std::unordered_set<std::string>& param_names,
+                         const std::vector<Scope*>& local_scopes,
+                         const BuildStrategy& strategy)
+      : places_(places),
+        loss_var_name_(loss_var_name),
+        param_names_(param_names),
+        local_scopes_(local_scopes),
+        strategy_(strategy) {}
+
+#ifdef PADDLE_WITH_CUDA
+  void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) {
+    nccl_ctxs_ = nccl_ctxs;
+  }
+#endif
+
+  std::unique_ptr<SSAGraphBuilder> Create();
+
+ private:
+  std::vector<platform::Place> places_;
+  std::string loss_var_name_;
+  std::unordered_set<std::string> param_names_;
+  std::vector<Scope*> local_scopes_;
+  BuildStrategy strategy_;
+
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap* nccl_ctxs_;
+#endif
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 17baacd13eecac8f410631fe9e94788da4fff848..97242ebf2af304e1498e2ef37cd87d1ef07fb6df 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -30,10 +30,6 @@
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #endif
 
-DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot",
-              "the ssa graph path only print with GLOG_v=10,"
-              "default /tmp/graph.dot");
-
 namespace paddle {
 namespace framework {
 namespace details {
@@ -277,11 +273,6 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
    */
   AddOutputToLeafOps(&result);
 
-  if (VLOG_IS_ON(10)) {
-    std::ofstream fout(FLAGS_ssa_graph_path);
-    PrintGraphviz(*graph, fout);
-  }
-
   return std::unique_ptr<SSAGraph>(graph);
 }
 
@@ -473,9 +464,8 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
 
 void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
                                           const OpDesc &op) const {
-  auto &p = places_[0];
-  auto *s = local_scopes_[0];
-  result->ops_.emplace_back(new RPCOpHandle(op, s, p, op.Type()));
+  result->ops_.emplace_back(
+      new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0]));
 
   if (op.Type() == "send_barrier") {
     ConnectOp(result, result->ops_.back().get(), "send_vars");
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index 95aa599cd3e403e9cc66b2b5ad35d0d214d1ab5b..5bba089ade801a06e0364835efe5249105dcfcac 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -11,10 +11,12 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include <algorithm>
 
+#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
-#include <algorithm>
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
 
 namespace paddle {
 namespace framework {
@@ -30,27 +32,34 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
 }
 
 void NCCLAllReduceOpHandle::RunImpl() {
-  if (inputs_.size() == 1) {
+  if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
   } else {
     // Wait input done
     WaitInputVarGenerated();
-
-    auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
-    int dtype = -1;
-    size_t numel = 0;
+    auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), places_.size(),
+        "The NoDummyInputSize should be equal to the number of places.");
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), out_var_handles.size(),
+        "The NoDummyInputSize and NoDummyOutputSize should be equal.");
 
     std::vector<const LoDTensor *> lod_tensors;
-
     for (size_t i = 0; i < local_scopes_.size(); ++i) {
       auto *s = local_scopes_[i];
       auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
-
-      auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
+      auto &lod_tensor =
+          local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
       lod_tensors.emplace_back(&lod_tensor);
+      PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+                        "The name of input and output should be equal.");
     }
 
     if (platform::is_gpu_place(lod_tensors[0]->place())) {
+      int dtype = -1;
+      size_t numel = 0;
       std::vector<std::function<void()>> all_reduce_calls;
       for (size_t i = 0; i < local_scopes_.size(); ++i) {
         auto &p = places_[i];
@@ -96,7 +105,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
         auto &scope =
             *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
         auto &p = places_[i];
-        auto *var = scope.FindVar(var_name);
+        auto *var = scope.FindVar(in_var_handles[i]->name_);
         auto *dev_ctx = dev_ctxes_[p];
 
         RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
index a0c321843e3fc5abcbd1ef2ce2e153250269aa7d..8e98d894b828b4162059b30f5c6a74cfc06f402e 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -41,8 +41,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
   void RunImpl() override;
 
  private:
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
   const platform::NCCLContextMap &nccl_ctxs_;
 };
 
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 6b064650b4f09737836bda4a43fa421720077929..3849cca59a3347137b769f97261cfbf97da8d6ff 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -104,6 +104,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
   }
 }
 
+size_t OpHandleBase::NoDummyInputSize() const {
+  size_t cnt = 0;
+  for (auto *in : inputs_) {
+    if (dynamic_cast<DummyVarHandle *>(in) == nullptr) {
+      ++cnt;
+    }
+  }
+  return cnt;
+}
+
 bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
   return in_var && in_var->generated_op_;
 }
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 8f94206a87dbae8a81727ca48718886bbabbe25c..dc92b0fe9f760d95d4869fdd56c0400b6710437f 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -80,6 +80,8 @@ class OpHandleBase {
 
   const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
 
+  size_t NoDummyInputSize() const;
+
  protected:
   void RunAndRecordEvent(const std::function<void()> &callback);
 
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index c652a2f4eb0f9b73cb19ebbd9d0809210b280ad3..4d14334cdfe06e2e805c2577458d6689e6324cc7 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -32,8 +32,8 @@ namespace framework {
 namespace details {
 
 struct ReduceOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
 
 #ifdef PADDLE_WITH_CUDA
   const platform::NCCLContextMap *nccl_ctxs_;
diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc
index 7f4da4c01de1010467d839ee5490c5e0d02d8c24..586465f99fd94117c821be2952bffda385fbcf75 100644
--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -19,12 +19,12 @@ namespace framework {
 namespace details {
 
 RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc,
-                         const Scope *local_scope, const platform::Place &place,
-                         const std::string &name)
+                         const Scope *local_scope, const std::string &name,
+                         const platform::Place &place)
     : op_(framework::OpRegistry::CreateOp(op_desc)),
       local_scope_(local_scope),
-      place_(place),
-      name_(name) {}
+      name_(name),
+      place_(place) {}
 
 void RPCOpHandle::RunImpl() {
   // TODO(wuyi): need further analysis whether wait VarDummyHandle.
diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h
index d28b7721720d808a8d81701c3811eae16121fb41..ae38c7fe19e102a330455d89a1068414a7835fab 100644
--- a/paddle/fluid/framework/details/rpc_op_handle.h
+++ b/paddle/fluid/framework/details/rpc_op_handle.h
@@ -29,7 +29,7 @@ namespace details {
 
 struct RPCOpHandle : public OpHandleBase {
   RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
-              const platform::Place& place, const std::string& name);
+              const std::string& name, const platform::Place& place);
 
   std::string Name() const override;
 
@@ -43,8 +43,8 @@ struct RPCOpHandle : public OpHandleBase {
  private:
   std::unique_ptr<OperatorBase> op_;
   const Scope* local_scope_;
-  const platform::Place& place_;
   const std::string name_;
+  platform::Place place_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 6a567527550883add08031e50aa8de2b204cf13d..211113c7979ee95d896c0a57879f7b3ad13b36ef 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -73,64 +73,6 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
   op_handle->AddOutput(var);
 }
 
-template <typename Callback>
-void IterAllVar(const SSAGraph &graph, Callback callback) {
-  for (auto &each : graph.vars_) {
-    for (auto &pair1 : each) {
-      for (auto &pair2 : pair1.second) {
-        callback(*pair2);
-      }
-    }
-  }
-
-  for (auto &var : graph.dep_vars_) {
-    callback(*var);
-  }
-}
-
-void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
-  size_t var_id = 0;
-  std::unordered_map<const VarHandleBase *, size_t> vars;
-
-  sout << "digraph G {\n";
-
-  IterAllVar(graph, [&](const VarHandleBase &var) {
-    auto *var_ptr = &var;
-    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
-    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
-
-    size_t cur_var_id = var_id++;
-    vars[var_ptr] = cur_var_id;
-
-    if (var_handle_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
-           << "\\n"
-           << var_handle_ptr->place_ << "\\n"
-           << var_handle_ptr->version_ << "\"]" << std::endl;
-    } else if (dummy_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
-    }
-  });
-
-  size_t op_id = 0;
-  for (auto &op : graph.ops_) {
-    std::string op_name = "op_" + std::to_string(op_id++);
-    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
-         << std::endl;
-    for (auto in : op->Inputs()) {
-      std::string var_name = "var_" + std::to_string(vars[in]);
-      sout << var_name << " -> " << op_name << std::endl;
-    }
-
-    for (auto out : op->Outputs()) {
-      std::string var_name = "var_" + std::to_string(vars[out]);
-      sout << op_name << " -> " << var_name << std::endl;
-    }
-  }
-
-  sout << "}\n";
-}
-
 void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
   for (auto &op : graph->ops_) {
     if (!op->Outputs().empty()) {
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index 64e5d93081eb76c56898bbeb530e37364619fdbb..5fc12a44b51fae26e5a8f5fdba952d3879e82d0f 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -55,8 +55,6 @@ class SSAGraphBuilder {
                              const platform::Place &place, size_t place_offset);
 
   static void AddOutputToLeafOps(SSAGraph *graph);
-
-  static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22a40ca4b25cdd8ed9856b6c71bffc79561edcac
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+#include <string>
+#include "paddle/fluid/framework/details/ssa_graph.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+template <typename Callback>
+static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
+  for (auto &each : graph.vars_) {
+    for (auto &pair1 : each) {
+      for (auto &pair2 : pair1.second) {
+        callback(*pair2);
+      }
+    }
+  }
+
+  for (auto &var : graph.dep_vars_) {
+    callback(*var);
+  }
+}
+
+void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
+                                    std::ostream &sout) const {
+  size_t var_id = 0;
+  std::unordered_map<const VarHandleBase *, size_t> vars;
+
+  sout << "digraph G {\n";
+
+  IterAllVar(graph, [&](const VarHandleBase &var) {
+    auto *var_ptr = &var;
+    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
+    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
+
+    size_t cur_var_id = var_id++;
+    vars[var_ptr] = cur_var_id;
+
+    if (var_handle_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
+           << "\\n"
+           << var_handle_ptr->place_ << "\\n"
+           << var_handle_ptr->version_ << "\"]" << std::endl;
+    } else if (dummy_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
+    }
+  });
+
+  size_t op_id = 0;
+  for (auto &op : graph.ops_) {
+    std::string op_name = "op_" + std::to_string(op_id++);
+    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
+         << std::endl;
+    for (auto in : op->Inputs()) {
+      std::string var_name = "var_" + std::to_string(vars[in]);
+      sout << var_name << " -> " << op_name << std::endl;
+    }
+
+    for (auto out : op->Outputs()) {
+      std::string var_name = "var_" + std::to_string(vars[out]);
+      sout << op_name << " -> " << var_name << std::endl;
+    }
+  }
+
+  sout << "}\n";
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4c90013789759d17646d95efdc81fc6a0a4f3e7
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+struct SSAGraph;
+class SSAGraphPrinter {
+ public:
+  virtual ~SSAGraphPrinter() {}
+  virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0;
+};
+
+class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
+ public:
+  void Print(const SSAGraph& graph, std::ostream& sout) const override;
+};
+
+class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
+ public:
+  SSAGraghBuilderWithPrinter(std::ostream& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ref_(sout) {}
+
+  SSAGraghBuilderWithPrinter(std::unique_ptr<std::ostream>&& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ptr_(std::move(sout)),
+        stream_ref_(*stream_ptr_) {}
+
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
+    auto graph = builder_->Build(program);
+    printer_->Print(*graph, stream_ref_);
+    return graph;
+  }
+
+ private:
+  std::unique_ptr<SSAGraphPrinter> printer_;
+  std::unique_ptr<SSAGraphBuilder> builder_;
+  std::unique_ptr<std::ostream> stream_ptr_;
+  std::ostream& stream_ref_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h
index fab20d75f5a45257f243333c1998d7b2549a25f9..f51a184e7bae2283f335fe9462a77b9c5fb831a5 100644
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -87,7 +87,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
 }
 
 inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
-  return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r;
+  bool ret =
+      (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r);
+#ifdef PADDLE_WITH_MKLDNN
+  // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa
+  ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN);
+  ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN);
+#endif
+  return ret;
 }
 
 inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 748317438b44bc4af84f13b25f8e4f88386388fb..43ab227a9478707445892c14723801992d0041aa 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -83,8 +83,14 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
 
   void operator()(const char* op_type, const char* library_type) const {
     using T = typename KERNEL_TYPE::ELEMENT_TYPE;
+    std::string library(library_type);
+    std::string data_layout = "ANYLAYOUT";
+    if (library == "MKLDNN") {
+      data_layout = "MKLDNNLAYOUT";
+    }
     OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
-                     DataLayout::kAnyLayout, StringToLibraryType(library_type));
+                     StringToDataLayout(data_layout),
+                     StringToLibraryType(library_type));
     OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
 
     constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
@@ -99,7 +105,8 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
   void operator()(const char* op_type, const char* library_type) const {}
 };
 
-// User can register many kernel in one place. The data type could be different.
+// User can register many kernel in one place. The data type could be
+// different.
 template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
  public:
@@ -149,15 +156,15 @@ class OpKernelRegistrar : public Registrar {
 /**
  * Macro to register OperatorKernel.
  */
-#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...)        \
+#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)        \
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
-      __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__,                      \
+      __reg_op_kernel_##op_type##_##library_type##__,                      \
       "REGISTER_OP_KERNEL must be called in global namespace");            \
   static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__>  \
-      __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type,       \
-                                                           #LIBRARY_TYPE); \
-  int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() {                \
-    __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch();          \
+      __op_kernel_registrar_##op_type##_##library_type##__(#op_type,       \
+                                                           #library_type); \
+  int TouchOpKernelRegistrar_##op_type##_##library_type() {                \
+    __op_kernel_registrar_##op_type##_##library_type##__.Touch();          \
     return 0;                                                              \
   }
 
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index f87d5521492418d2daf5b7fba1500c4bb31e10f5..c633a2f847683debce08c40b0c2ed6e58c0a7ad1 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -293,6 +293,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) {
   }
 }
 
+bool ExecutionContext::HasInput(const std::string& name) const {
+  if (!op_.HasInputs(name)) {
+    return false;
+  }
+  auto& ins = Inputs(name);
+  size_t length = ins.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Input %s should not have more than one inputs", name);
+  auto arg = ins[0];
+  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
+  return var != nullptr;
+}
+
+bool ExecutionContext::HasOutput(const std::string& name) const {
+  if (!op_.HasOutputs(name)) {
+    return false;
+  }
+  auto& outs = Outputs(name);
+  size_t length = outs.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Output %s should not have more than one inputs", name);
+  auto arg = outs[0];
+  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
+  return var != nullptr;
+}
+
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
   auto* var = InputVar(name);
@@ -444,10 +476,25 @@ class RuntimeInferShapeContext : public InferShapeContext {
     auto* out_tensor = out_var->GetMutable<LoDTensor>();
     out_tensor->set_lod(in_tensor.lod());
 
-    // TODO(dzhwinter) : reuse ShareLoD in most operators.
-    // Need to call ShareLayout explicitly in sequence related ops.
-    // Shall we have a better method to shared info between in/out Tensor?
-    out_tensor->set_layout(in_tensor.layout());
+// TODO(dzhwinter) : reuse ShareLoD in most operators.
+// Need to call ShareLayout explicitly in sequence related ops.
+// Shall we have a better method to shared info between in/out Tensor?
+#ifdef PADDLE_WITH_MKLDNN
+    // Fix me: ugly workaround below
+    // Correct solution:
+    //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
+    //    layout of output tensor should be set "manually" in Compute()
+    //    of each OPKernel. The reason layout should NOT be shared between
+    //    input and output "automatically" (now by InferShape()->ShareLoD())
+    //    is that layout transform may occur after InferShape().
+    // Workaround:
+    //    Skip set_layout() when input layout is kMKLDNN
+    //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
+    //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
+    //    in Compute()
+    if (in_tensor.layout() != DataLayout::kMKLDNN)
+#endif
+      out_tensor->set_layout(in_tensor.layout());
   }
 
   void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
@@ -646,8 +693,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
         }
         if (t != nullptr) {
           int tmp = static_cast<int>(ToDataType(t->type()));
-          PADDLE_ENFORCE(tmp == data_type || data_type == -1,
-                         "DataType of Paddle Op %s must be the same.", Type());
+          PADDLE_ENFORCE(
+              tmp == data_type || data_type == -1,
+              "DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
+              data_type, tmp);
           data_type = tmp;
         }
       }
@@ -665,7 +714,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType(
 OpKernelType OperatorWithKernel::GetKernelTypeForVar(
     const std::string& var_name, const Tensor& tensor,
     const OpKernelType& expected_kernel_type) const {
-  return OpKernelType(expected_kernel_type.data_type_, tensor.place());
+  return OpKernelType(expected_kernel_type.data_type_, tensor.place(),
+                      tensor.layout());
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 2f480e00c100d579e100de17d3feb957f5ef6167..b1d75d0d0ff3dccc67a1e833ccfe03a4cad8df39 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -191,9 +191,9 @@ class ExecutionContext {
     return op_.Attr<T>(name);
   }
 
-  bool HasInput(const std::string& name) const { return op_.HasInputs(name); }
+  bool HasInput(const std::string& name) const;
 
-  bool HasOutput(const std::string& name) const { return op_.HasOutputs(name); }
+  bool HasOutput(const std::string& name) const;
 
   size_t InputSize(const std::string& name) const {
     return op_.Inputs(name).size();
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 003304b85af00165d54efbb199be01b2c5106768..ce56f55e4195a0625cd0754152285b80e4282183 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/graph_builder_factory.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -102,22 +102,19 @@ ParallelExecutor::ParallelExecutor(
     var_infos.back().persistable_ = var->Persistable();
   }
 
-// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
-// ncclOp
-#ifdef PADDLE_WITH_CUDA
-  details::MultiDevSSAGraphBuilder builder(
+  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
+  // ncclOp
+
+  details::SSAGraphBuilderFactory builder_factory(
       member_->places_, loss_var_name, params, member_->local_scopes_,
-      member_->nccl_ctxs_.get(), build_strategy);
-#else
-  details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
-                                           params, member_->local_scopes_,
-                                           build_strategy);
+      build_strategy);
+#ifdef PADDLE_WITH_CUDA
+  builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
 #endif
 
-  auto graph = builder.Build(main_program);
-
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      exec_strategy, member_->local_scopes_, places, std::move(graph)));
+      exec_strategy, member_->local_scopes_, places,
+      builder_factory.Create()->Build(main_program)));
 
   member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
       exec_strategy, member_->local_scopes_, std::move(var_infos),
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 9091713158c8071d5386f14250e3c546284e7fd0..bb2d866c824e0fec1b241caea407a38c88a3cb51 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -34,13 +34,7 @@ DEFINE_bool(
 namespace paddle {
 namespace framework {
 
-Scope::~Scope() {
-  DropKids();
-  for (auto& kv : vars_) {
-    VLOG(3) << "Destroy variable " << kv.first;
-    delete kv.second;
-  }
-}
+Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
   std::unique_lock<std::mutex> lock(mutex_);
@@ -49,10 +43,13 @@ Scope& Scope::NewScope() const {
 }
 
 Variable* Scope::Var(const std::string& name) {
+  // acquire the lock when new var under this scope
+  std::unique_lock<std::mutex> lock(mutex_);
   auto* v = FindVarLocally(name);
   if (v != nullptr) return v;
+
   v = new Variable();
-  vars_[name] = v;
+  vars_[name].reset(v);
   VLOG(3) << "Create variable " << name;
   v->name_ = &(vars_.find(name)->first);
   return v;
@@ -67,22 +64,29 @@ Variable* Scope::Var(std::string* name) {
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
+  // acquire the lock when find var
+  std::unique_lock<std::mutex> lock(mutex_);
+  return FindVarInternal(name);
+}
+
+Variable* Scope::FindVarInternal(const std::string& name) const {
   auto var = FindVarLocally(name);
   if (var != nullptr) {
     return var;
   }
-  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
+  return (parent_ == nullptr) ? nullptr : parent_->FindVarInternal(name);
 }
 
 const Scope* Scope::FindScope(const Variable* var) const {
   for (auto& kv : vars_) {
-    if (kv.second == var) {
+    if (kv.second.get() == var) {
       return this;
     }
   }
   return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
 }
 void Scope::DropKids() {
+  std::unique_lock<std::mutex> lock(mutex_);
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }
@@ -110,10 +114,10 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
+  std::unique_lock<std::mutex> lock(mutex_);
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
-      delete it->second;
       it = vars_.erase(it);
     } else {
       ++it;
@@ -129,7 +133,7 @@ void Scope::Rename(const std::string& origin_name,
   auto new_it = vars_.find(new_name);
   PADDLE_ENFORCE(new_it == vars_.end(),
                  "The variable with name %s is already in the scope", new_name);
-  vars_[new_name] = origin_it->second;
+  vars_[new_name].reset(origin_it->second.release());
   vars_.erase(origin_it);
 }
 
@@ -141,7 +145,7 @@ std::string Scope::Rename(const std::string& origin_name) const {
 
 Variable* Scope::FindVarLocally(const std::string& name) const {
   auto it = vars_.find(name);
-  if (it != vars_.end()) return it->second;
+  if (it != vars_.end()) return it->second.get();
   return nullptr;
 }
 
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index abc82e452d732638a2f7315022074850f299a7ea..98d103d867987fc02dc66df5ac855a14b66b8f03 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -47,15 +47,18 @@ class Scope {
   Scope& NewScope() const;
 
   /// Create a variable with given name if it doesn't exist.
+  /// Caller doesn't own the returned Variable.
   Variable* Var(const std::string& name);
 
   /// Create a variable with a scope-unique name.
+  /// Caller doesn't own the returned Variable.
   Variable* Var(std::string* name = nullptr);
 
   void EraseVars(const std::vector<std::string>& var_names);
 
   /// Find a variable in the scope or any of its ancestors.  Returns
   /// nullptr if cannot find.
+  /// Caller doesn't own the returned Variable.
   Variable* FindVar(const std::string& name) const;
 
   const Scope* parent() const { return parent_; }
@@ -78,13 +81,21 @@ class Scope {
   // Rename variable to a new name and return the new name
   std::string Rename(const std::string& origin_name) const;
 
-  Variable* FindVarLocally(const std::string& name) const;
-
  private:
   // Call Scope::NewScope for a sub-scope.
   explicit Scope(Scope const* parent) : parent_(parent) {}
 
-  mutable std::unordered_map<std::string, Variable*> vars_;
+  // Called by FindVar recursively.
+  // Caller doesn't own the returned Variable.
+  Variable* FindVarInternal(const std::string& name) const;
+
+  // Called by FindVarInternal and Var.
+  // Caller doesn't own the returned Variable.
+  Variable* FindVarLocally(const std::string& name) const;
+
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+
+  // Scope in `kids_` are owned by this class.
   mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
 
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index e97ada06f06d0538f17160220e3aa3f4ffc55520..c7286dacf01659f3af0927a71856e5a6496cb877 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -15,5 +15,102 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
-namespace framework {}
+namespace framework {
+extern size_t SizeOfType(std::type_index type);
+void Tensor::check_memory_size() const {
+  PADDLE_ENFORCE_NOT_NULL(
+      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE_LE(
+      numel() * SizeOfType(type()), memory_size(),
+      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+      "first to re-allocate memory.\n"
+      "or maybe the required data-type mismatches the data already stored.");
+}
+
+size_t Tensor::memory_size() const {
+  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
+}
+
+void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+  if (holder_ != nullptr) {
+    holder_->set_type(type);
+  }
+  PADDLE_ENFORCE_GE(numel(), 0,
+                    "When calling this method, the Tensor's numel must be "
+                    "equal or larger than zero. "
+                    "Please check Tensor::Resize has been called first.");
+  int64_t size = numel() * SizeOfType(type);
+  /* some versions of boost::variant don't have operator!= */
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + offset_) {
+    if (platform::is_cpu_place(place)) {
+      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size, type));
+    } else if (platform::is_gpu_place(place) ||
+               platform::is_cuda_pinned_place(place)) {
+#ifndef PADDLE_WITH_CUDA
+      PADDLE_THROW(
+          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
+    }
+#else
+      if (platform::is_gpu_place(place)) {
+        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
+            boost::get<platform::CUDAPlace>(place), size, type));
+      } else if (platform::is_cuda_pinned_place(place)) {
+        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
+            boost::get<platform::CUDAPinnedPlace>(place), size, type));
+      }
+    }
+#endif
+    offset_ = 0;
+  }
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                 offset_);
+}
+
+void* Tensor::mutable_data(platform::Place place) {
+  PADDLE_ENFORCE(this->holder_ != nullptr,
+                 "Cannot invoke mutable data if current hold nothing.");
+  return mutable_data(place, holder_->type());
+}
+
+Tensor& Tensor::ShareDataWith(const Tensor& src) {
+  src.check_memory_size();
+  *this = src;
+  return *this;
+}
+
+Tensor Tensor::Slice(int begin_idx, int end_idx) const {
+  check_memory_size();
+  PADDLE_ENFORCE_GE(begin_idx, 0,
+                    "The start row index must be greater than 0.");
+  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
+  PADDLE_ENFORCE_LT(
+      begin_idx, end_idx,
+      "The start row index must be lesser than the end row index.");
+
+  if (dims_[0] == 1) {
+    return *this;
+  } else {
+    size_t base = numel() / dims_[0];
+    Tensor dst;
+    dst.holder_ = holder_;
+    dst.set_layout(layout_);
+    DDim dst_dims = dims_;
+    dst_dims[0] = end_idx - begin_idx;
+    dst.Resize(dst_dims);
+    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
+    return dst;
+  }
+}
+
+Tensor& Tensor::Resize(const DDim& dims) {
+  dims_ = dims;
+  return *this;
+}
+
+const DDim& Tensor::dims() const { return dims_; }
+
+int64_t Tensor::numel() const { return product(dims_); }
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 6f878541e6de1deec1829145b1b325ecd176a034..ef224d68f1fc561f45e9d7a81425e62655457648 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -34,6 +34,28 @@ namespace framework {
 class LoDTensor;
 
 class Tensor {
+#ifdef PADDLE_WITH_MKLDNN
+
+ public:
+  inline mkldnn::memory::format format() const { return format_; }
+
+  inline void set_format(const mkldnn::memory::format format) {
+    format_ = format;
+  }
+
+ protected:
+  /**
+   * @brief the detail format of memory block which have layout as kMKLDNN
+   *
+   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
+   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
+   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
+   *       this field.
+   */
+
+  mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
+#endif
+
  public:
   template <typename T, size_t D, int MajorType, typename IndexType>
   friend struct EigenTensor;
@@ -54,26 +76,24 @@ class Tensor {
 
   /*! Return a pointer to mutable memory block. */
   template <typename T>
-  inline T* data();
+  T* data();
 
   /*! Return a pointer to constant memory block. */
   template <typename T>
-  inline const T* data() const;
+  const T* data() const;
 
-  inline bool IsInitialized() const;
-
-  inline void switch_place(platform::Place new_place);
+  bool IsInitialized() const;
 
   /**
    * @brief   Return a pointer to mutable memory block.
    * @note    If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(platform::Place place);
+  T* mutable_data(platform::Place place);
 
-  inline void* mutable_data(platform::Place place, std::type_index type);
+  void* mutable_data(platform::Place place, std::type_index type);
 
-  inline void* mutable_data(platform::Place place);
+  void* mutable_data(platform::Place place);
 
   /**
    * @brief     Return a pointer to mutable memory block.
@@ -84,19 +104,19 @@ class Tensor {
    * @note      If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place);
+  T* mutable_data(DDim dims, platform::Place place);
 
   /*! Return the dimensions of the memory block. */
-  inline const DDim& dims() const;
+  const DDim& dims() const;
 
   /*! Return the numel of the memory block. */
-  inline int64_t numel() const;
+  int64_t numel() const;
 
   /*! Resize the dimensions of the memory block. */
-  inline Tensor& Resize(const DDim& dims);
+  Tensor& Resize(const DDim& dims);
 
   /*! The internal of two tensors share the same memory block. */
-  inline Tensor& ShareDataWith(const Tensor& src);
+  Tensor& ShareDataWith(const Tensor& src);
 
   /**
    * @brief  Return a sub-tensor of the given tensor.
@@ -106,7 +126,7 @@ class Tensor {
    * @param[in] end_idx     The index of the end row(exclusive) to slice.
    *                        The index number begins from 0.
    */
-  inline Tensor Slice(int begin_idx, int end_idx) const;
+  Tensor Slice(int begin_idx, int end_idx) const;
 
   platform::Place place() const {
     PADDLE_ENFORCE_NOT_NULL(
@@ -123,11 +143,11 @@ class Tensor {
   // memory size returns the holding memory size in byte.
   size_t memory_size() const;
 
-  inline void check_memory_size() const;
+  void check_memory_size() const;
 
-  inline DataLayout layout() const { return layout_; }
+  DataLayout layout() const { return layout_; }
 
-  inline void set_layout(const DataLayout layout) { layout_ = layout; }
+  void set_layout(const DataLayout layout) { layout_ = layout; }
 
  private:
   /**
@@ -197,8 +217,10 @@ class Tensor {
    *       N,C,H,W for respectively the batch size, the number of
    *       feature maps, the height.
    */
-
-  DataLayout layout_ = DataLayout::kNHWC;
+  // Fix me: here just change the default layout to kNCHW
+  // it doesn't fix the real issue, i.e. feeder should set up tensor layout
+  // according to actual input data
+  DataLayout layout_ = DataLayout::kNCHW;
 
   /**
    * @brief   A PlaceHolder may be shared by more than one tensor.
@@ -210,15 +232,6 @@ class Tensor {
   size_t offset_;
 };
 
-inline void Tensor::switch_place(platform::Place new_place) {
-  if (holder_->place() == new_place) {
-    return;
-  }
-
-  // TODO(tonyyang-svail): do memcpy here.
-  PADDLE_THROW("Not Implemented");
-}
-
 }  // namespace framework
 }  // namespace paddle
 
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 2f19ec0f0a9338e2b96d1f64eac45387bae4d1eb..96114678a9992f2975c4173c7cc003114f04d8df 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -20,21 +20,6 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-extern size_t SizeOfType(std::type_index type);
-inline void Tensor::check_memory_size() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
-  PADDLE_ENFORCE_LE(
-      numel() * SizeOfType(type()), memory_size(),
-      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-      "first to re-allocate memory.\n"
-      "or maybe the required data-type mismatches the data already stored.");
-}
-
-inline size_t Tensor::memory_size() const {
-  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
-}
-
 template <typename T>
 inline const T* Tensor::data() const {
   check_memory_size();
@@ -73,88 +58,6 @@ inline T* Tensor::mutable_data(platform::Place place) {
   return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
 }
 
-inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
-  if (holder_ != nullptr) {
-    holder_->set_type(type);
-  }
-  PADDLE_ENFORCE_GE(numel(), 0,
-                    "When calling this method, the Tensor's numel must be "
-                    "equal or larger than zero. "
-                    "Please check Tensor::Resize has been called first.");
-  int64_t size = numel() * SizeOfType(type);
-  /* some versions of boost::variant don't have operator!= */
-  if (holder_ == nullptr || !(holder_->place() == place) ||
-      holder_->size() < size + offset_) {
-    if (platform::is_cpu_place(place)) {
-      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type));
-    } else if (platform::is_gpu_place(place) ||
-               platform::is_cuda_pinned_place(place)) {
-#ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW(
-          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
-    }
-#else
-      if (platform::is_gpu_place(place)) {
-        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-            boost::get<platform::CUDAPlace>(place), size, type));
-      } else if (platform::is_cuda_pinned_place(place)) {
-        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
-            boost::get<platform::CUDAPinnedPlace>(place), size, type));
-      }
-    }
-#endif
-    offset_ = 0;
-  }
-  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                 offset_);
-}
-
-inline void* Tensor::mutable_data(platform::Place place) {
-  PADDLE_ENFORCE(this->holder_ != nullptr,
-                 "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, holder_->type());
-}
-
-inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
-  src.check_memory_size();
-  *this = src;
-  return *this;
-}
-
-inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
-  check_memory_size();
-  PADDLE_ENFORCE_GE(begin_idx, 0,
-                    "The start row index must be greater than 0.");
-  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
-  PADDLE_ENFORCE_LT(
-      begin_idx, end_idx,
-      "The start row index must be lesser than the end row index.");
-
-  if (dims_[0] == 1) {
-    return *this;
-  } else {
-    size_t base = numel() / dims_[0];
-    Tensor dst;
-    dst.holder_ = holder_;
-    dst.set_layout(layout_);
-    DDim dst_dims = dims_;
-    dst_dims[0] = end_idx - begin_idx;
-    dst.Resize(dst_dims);
-    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
-    return dst;
-  }
-}
-
-inline Tensor& Tensor::Resize(const DDim& dims) {
-  dims_ = dims;
-  return *this;
-}
-
-inline const DDim& Tensor::dims() const { return dims_; }
-
-inline int64_t Tensor::numel() const { return product(dims_); }
-
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   Tensor res;
   res.ShareDataWith(src);
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index e1012de2ec36eb4a858202d56a678b6a204c2f0a..0a1cb6d5703dace5e6be73285655ecd9d2ad89fb 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -209,7 +209,7 @@ TEST(Tensor, ReshapeToMatrix) {
 
 TEST(Tensor, Layout) {
   framework::Tensor src;
-  ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC);
+  ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW);
   src.set_layout(framework::DataLayout::kAnyLayout);
   ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
 }
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index 153dca576bd6734d62f00c4a7cb9b503506b33e2..58eb0e715cb71d87179f3240de55021603cd7423 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -18,6 +18,8 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -107,6 +109,13 @@ class OrderedRegistry {
   std::vector<std::unique_ptr<T>> data_;
 };
 
+template <typename T>
+T &GetFromScope(const framework::Scope &scope, const std::string &name) {
+  framework::Variable *var = scope.FindVar(name);
+  PADDLE_ENFORCE(var != nullptr);
+  return *var->GetMutable<T>();
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 23ca8bfac84f35ebdca2e2a1a8538d366358ca8b..748f5a084e8c880df215a60fe51c835ba5cd3110 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,12 +1,15 @@
 # Add TRT tests
-nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine)
-# This test is not stable
-# See https://paddleci.ngrok.io/viewLog.html?tab=buildLog&buildTypeId=Paddle_PrCi2&buildId=36834&_focus=8828 
-#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
-#    DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
-#    SERIAL)
+nv_library(tensorrt_converter
+  SRCS mul_op.cc conv2d_op.cc fc_op.cc
+  DEPS tensorrt_engine mul_op)
+
+nv_test(test_op_converter SRCS test_op_converter.cc DEPS
+  ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
+
 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
 nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
 nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
index 79d01b640a214ed5eb86173a36d5e85a6626066f..e1cace9cc1b06f036f52e82b7b86c99a02d50f50 100644
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
@@ -21,7 +22,8 @@ namespace tensorrt {
 class ReluOpConverter : public OpConverter {
  public:
   ReluOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op) override {
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
     // Here the two nullptr looks strange, that's because the
     // framework::OpDesc's constructor is strange.
     framework::OpDesc op_desc(op, nullptr);
@@ -32,12 +34,17 @@ class ReluOpConverter : public OpConverter {
     nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
         engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
         nvinfer1::ActivationType::kRELU);
-    engine_->SetITensor(op_desc.Output("Out")[0], layer->getOutput(0));
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
   }
 };
 
-REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 668d344f1bba1c012dcb42c71b996209b4703d78..8e7e23377d4b2fe7afd51f1f58048fc4ed3c6d99 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -22,14 +22,14 @@ class Conv2dOpConverter : public OpConverter {
  public:
   Conv2dOpConverter() {}
   void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope) override {
+                  const framework::Scope& scope, bool test_mode) override {
     LOG(INFO)
         << "convert a fluid conv2d op to tensorrt conv layer without bias";
   }
 };
 
-REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 45b079559754a8f5c3fe39781b5700a75f425e99..bb603efaf30bb72d74b5583abc45d01a16c076a3 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -56,7 +56,7 @@ void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
 class FcOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope) override {
+                  const framework::Scope& scope, bool test_mode) override {
     VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
 
     framework::OpDesc op_desc(op, nullptr);
@@ -106,14 +106,16 @@ class FcOpConverter : public OpConverter {
                                        n_output, weight.get(), bias.get());
 
     auto output_name = op_desc.Output("Out").front();
-    engine_->DeclareOutput(layer, 0, output_name);
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
   }
 };
 
-REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
 
+REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
 USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index 6bb07709c7ee1c6b29c46425849a4f472d3df59d..3c342957360ad4192d838147bf37e84d233c2629 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -23,9 +23,8 @@ namespace tensorrt {
  */
 class MulOpConverter : public OpConverter {
  public:
-  MulOpConverter() {}
   void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope) override {
+                  const framework::Scope& scope, bool test_mode) override {
     VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
 
     framework::OpDesc op_desc(op, nullptr);
@@ -37,12 +36,18 @@ class MulOpConverter : public OpConverter {
         engine_, MatrixMultiply, *const_cast<nvinfer1::ITensor*>(input1), false,
         *const_cast<nvinfer1::ITensor*>(input2), false);
 
-    engine_->DeclareOutput(layer, 0, op_desc.Output("Out")[0]);
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
   }
 };
 
-REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+USE_OP(mul);
+REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 3beafeefd06f24ec50b0e61c1fabe13d7e53f242..c7a5a49dd02d0db022fabff5c3ae1c7800bac25c 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
@@ -34,12 +35,15 @@ class OpConverter {
 
   // Converter logic for an op.
   virtual void operator()(const framework::proto::OpDesc& op,
-                          const framework::Scope& scope) {}
+                          const framework::Scope& scope,
+                          bool test_mode = false) {}
 
-  // Convert a single fluid operaotr and add the corresponding layer to TRT.
+  // Convert a single fluid operator and add the corresponding layer to TRT.
+  // test_mode: whether the instance executes in an unit test.
   void ConvertOp(const framework::proto::OpDesc& op,
                  const std::unordered_set<std::string>& parameters,
-                 const framework::Scope& scope, TensorRTEngine* engine) {
+                 const framework::Scope& scope, TensorRTEngine* engine,
+                 bool test_mode = false) {
     framework::OpDesc op_desc(op, nullptr);
 
     OpConverter* it{nullptr};
@@ -57,7 +61,7 @@ class OpConverter {
     PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
                             op_desc.Type());
     it->SetEngine(engine);
-    (*it)(op, scope);
+    (*it)(op, scope, test_mode);
   }
 
   // convert fluid block to tensorrt network
@@ -77,6 +81,9 @@ class OpConverter {
   // TensorRT engine
   TensorRTEngine* engine_{nullptr};
 
+ protected:
+  bool test_mode_;
+
  private:
   // registered op converter map, whose key is the fluid op type, and value is
   // the pointer position of corresponding OpConverter class.
@@ -85,13 +92,24 @@ class OpConverter {
   framework::Scope* scope_{nullptr};
 };
 
-#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)       \
-  struct trt_##op_type__##_converter {                          \
-    trt_##op_type__##_converter() {                             \
-      Registry<OpConverter>::Register<Converter__>(#op_type__); \
-    }                                                           \
-  };                                                            \
-  trt_##op_type__##_converter trt_##op_type__##_converter__;
+#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)                      \
+  struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
+    trt_##op_type__##_converter() {                                            \
+      ::paddle::inference::                                                    \
+          Registry<paddle::inference::tensorrt::OpConverter>::Register<        \
+              ::paddle::inference::tensorrt::Converter__>(#op_type__);         \
+    }                                                                          \
+  };                                                                           \
+  trt_##op_type__##_converter trt_##op_type__##_converter__;                   \
+  int TouchConverterRegister_##op_type__() {                                   \
+    trt_##op_type__##_converter__.Touch();                                     \
+    return 0;                                                                  \
+  }
+
+#define USE_TRT_CONVERTER(op_type__)                                    \
+  extern int TouchConverterRegister_##op_type__();                      \
+  static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
+      TouchConverterRegister_##op_type__();
 
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index 86ca2ca08eb14265e1bfe7abd5eb6af5c83b8a5c..0a02a7bebf9efbd0555707e6cfa701ef1e7d9659 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -1,106 +1,47 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #include <gtest/gtest.h>
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/place.h"
-
-USE_OP(relu);
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-void Compare(const std::string op_type, float input, float expect) {
+TEST(ReluOpConverter, main) {
   framework::Scope scope;
-  platform::CUDAPlace place;
-  platform::CUDADeviceContext ctx(place);
-
-  // init fluid op and variable
-  auto x_var = scope.Var("X");
-  auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
-  x_tensor->Resize({1, 1});
-  x_tensor->mutable_data<float>(place);
-  std::vector<float> init;
-  init.push_back(input);
-  framework::TensorFromVector(init, ctx, x_tensor);
-
-  auto out_var = scope.Var("Out");
-  auto out_tensor = out_var->GetMutable<framework::LoDTensor>();
-  out_tensor->Resize({1, 1});
-  out_tensor->mutable_data<float>(place);
-
-  framework::OpDesc op_desc;
-  op_desc.SetType(op_type);
-  op_desc.SetInput("X", {"X"});
-  op_desc.SetOutput("Out", {"Out"});
-
-  auto op = framework::OpRegistry::CreateOp(*op_desc.Proto());
-
-  // run fluid op
-  op->Run(scope, place);
-  // get fluid output
-  std::vector<float> out1;
-  framework::TensorToVector(*out_tensor, ctx, &out1);
-
-  // init tensorrt op
-  cudaStream_t stream;
-  ASSERT_EQ(0, cudaStreamCreate(&stream));
-  TensorRTEngine* engine = new TensorRTEngine(1, 1 << 10, &stream);
-  engine->InitNetwork();
-  engine->DeclareInput("X", nvinfer1::DataType::kFLOAT,
-                       nvinfer1::DimsCHW{1, 1, 1});
-  // convert op
-  OpConverter op_converter;
-  op_converter.ConvertOp(*op_desc.Proto(), engine);
-
-  engine->DeclareOutput("Out");
-  engine->FreezeNetwork();
-
-  // convert LoDTensor to ITensor
-  size_t size = x_tensor->memory_size();
-  EngineIOConverter::ConvertInput(op_type, *x_tensor,
-                                  engine->buffer("X").buffer, size, &stream);
-  // run tensorrt Outp
-  engine->Execute(1);
-  // convert ITensor to LoDTensor
-  EngineIOConverter::ConvertOutput(op_type, engine->buffer("Out").buffer,
-                                   out_tensor, size, &stream);
-  // get tensorrt output
-  std::vector<float> out2;
-  framework::TensorToVector(*out_tensor, ctx, &out2);
-
-  // compare
-  ASSERT_EQ(out1[0], out2[0]);
-  ASSERT_EQ(out1[0], expect);
-
-  delete engine;
-  cudaStreamDestroy(stream);
-}
-
-TEST(OpConverter, ConvertRelu) {
-  Compare("relu", 1, 1);   // relu(1) = 1
-  Compare("relu", -5, 0);  // relu(-5) = 0
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6));
+  validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("relu");
+  desc.SetInput("X", {"relu-X"});
+  desc.SetOutput("Out", {"relu-Out"});
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(10);
 }
 
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(activation);
+USE_OP(relu);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 1d3f5eabb2f839b2acfa9da6527589df1ec3767f..9b79f86b0edba983019bd932f52b08711ff36d41 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -36,3 +36,5 @@ TEST(OpConverter, ConvertBlock) {
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+USE_TRT_CONVERTER(conv2d)
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index d7e05dd5b5b235b7b166b22c5b094dc364e28dfc..236d169017f65e4c9d513c3ca4511daa2dfee06e 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -27,6 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
 
 namespace paddle {
 namespace inference {
@@ -104,8 +105,8 @@ class TRTConvertValidation {
   void SetOp(const framework::proto::OpDesc& desc) {
     op_ = framework::OpRegistry::CreateOp(desc);
 
-    OpConverter op_converter;
-    op_converter.ConvertOp(desc, parameters_, scope_, engine_.get());
+    Singleton<OpConverter>::Global().ConvertOp(
+        desc, parameters_, scope_, engine_.get(), true /*test_mode*/);
 
     engine_->FreezeNetwork();
 
@@ -150,7 +151,8 @@ class TRTConvertValidation {
       // Compare two output
       ASSERT_FALSE(fluid_out.empty());
       for (size_t i = 0; i < fluid_out.size(); i++) {
-        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6);
+        // Loose the threshold for CI in different machine model.
+        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
       }
     }
   }
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 3d75fefc1a735168131a6c67ac073e80aba32945..596e0fe9da3d272ecb1c0f8dbef09a75d08a4b1a 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -43,9 +43,10 @@ void TensorRTEngine::Execute(int batch_size) {
 }
 
 TensorRTEngine::~TensorRTEngine() {
+  cudaStreamSynchronize(*stream_);
   // clean buffer
   for (auto& buf : buffers_) {
-    if (buf.buffer != nullptr) {
+    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
       PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
       buf.buffer = nullptr;
       buf.max_size = 0;
@@ -80,6 +81,8 @@ void TensorRTEngine::FreezeNetwork() {
     auto& buf = buffer(item.first);
     CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
     PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
+    VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
+            << buf.buffer;
     buf.size = buf.max_size = item.second;
     buf.device = DeviceType::GPU;
   }
@@ -96,6 +99,7 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
   PADDLE_ENFORCE(input, "infer network add input %s failed", name);
   buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
                         analysis::AccuDims(dims.d, dims.nbDims);
+  PADDLE_ENFORCE(input->isNetworkInput());
   TensorRTEngine::SetITensor(name, input);
   return input;
 }
@@ -109,7 +113,9 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
   SetITensor(name, output);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
+  PADDLE_ENFORCE(!output->isNetworkInput());
   infer_network_->markOutput(*output);
+  PADDLE_ENFORCE(output->isNetworkOutput());
   // output buffers' size can only be decided latter, set zero here to mark this
   // and will reset latter.
   buffer_sizes_[name] = 0;
@@ -122,6 +128,7 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
   auto* output = TensorRTEngine::GetITensor(name);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
+  PADDLE_ENFORCE(!output->isNetworkInput());
   infer_network_->markOutput(*output);
   // output buffers' size can only be decided latter, set zero here to mark this
   // and will reset latter.
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index fabcfd9e80cc0ef2637201a1499ebbe2d6adfd8c..b60f00de9fa5fc8f8f4537379bf9ee9c8bb6f31c 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/inference/utils/singleton.h"
 
 namespace paddle {
 namespace inference {
@@ -131,7 +132,11 @@ class TensorRTEngine : public EngineBase {
   // TensorRT related internal members
   template <typename T>
   struct Destroyer {
-    void operator()(T* x) { x->destroy(); }
+    void operator()(T* x) {
+      if (x) {
+        x->destroy();
+      }
+    }
   };
   template <typename T>
   using infer_ptr = std::unique_ptr<T, Destroyer<T>>;
@@ -155,6 +160,27 @@ class TensorRTEngine : public EngineBase {
 #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
   engine__->network()->add##layer__(ARGS);
 
+/*
+ * Helper to control the TensorRT engine's creation and deletion.
+ */
+class TRT_EngineManager {
+ public:
+  TensorRTEngine* Create(int max_batch, int max_workspace,
+                         cudaStream_t* stream) {
+    engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream));
+    return engines_.back().get();
+  }
+
+  void DeleteALl() {
+    for (auto& ptr : engines_) {
+      ptr.reset(nullptr);
+    }
+  }
+
+ private:
+  std::vector<std::unique_ptr<TensorRTEngine>> engines_;
+};
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index 70aa42ac4111c0524a55e26aaefa864338c1d6c1..a0e83a17058a4edcb8f23f23ce155e644ae0cf3b 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -101,23 +101,22 @@ void SplitData(
 }
 
 void ThreadRunInfer(
-    const int tid, paddle::framework::Executor* executor,
-    paddle::framework::Scope* scope,
-    const std::unique_ptr<paddle::framework::ProgramDesc>& inference_program,
+    const int tid, paddle::framework::Scope* scope,
     const std::vector<std::vector<const paddle::framework::LoDTensor*>>& jobs) {
-  auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
-      new paddle::framework::ProgramDesc(*inference_program));
+  // maybe framework:ProgramDesc is not thread-safe
   auto& sub_scope = scope->NewScope();
+  auto place = paddle::platform::CPUPlace();
+  auto executor = paddle::framework::Executor(place);
+  auto inference_program =
+      paddle::inference::Load(&executor, scope, FLAGS_model_path);
 
-  std::string feed_holder_name = "feed_" + paddle::string::to_string(tid);
-  std::string fetch_holder_name = "fetch_" + paddle::string::to_string(tid);
-  copy_program->SetFeedHolderName(feed_holder_name);
-  copy_program->SetFetchHolderName(fetch_holder_name);
+  auto ctx = executor.Prepare(*inference_program, /*block_id*/ 0);
+  executor.CreateVariables(*inference_program, &sub_scope, /*block_id*/ 0);
 
   const std::vector<std::string>& feed_target_names =
-      copy_program->GetFeedTargetNames();
+      inference_program->GetFeedTargetNames();
   const std::vector<std::string>& fetch_target_names =
-      copy_program->GetFetchTargetNames();
+      inference_program->GetFetchTargetNames();
 
   PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
   std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
@@ -131,9 +130,8 @@ void ThreadRunInfer(
   auto start_ms = GetCurrentMs();
   for (size_t i = 0; i < inputs.size(); ++i) {
     feed_targets[feed_target_names[0]] = inputs[i];
-    executor->Run(*copy_program, &sub_scope, &feed_targets, &fetch_targets,
-                  true /*create_local_scope*/, true /*create_vars*/,
-                  feed_holder_name, fetch_holder_name);
+    executor.RunPreparedContext(ctx.get(), &sub_scope, &feed_targets,
+                                &fetch_targets, false /*create_local_scope*/);
   }
   auto stop_ms = GetCurrentMs();
   scope->DeleteScope(&sub_scope);
@@ -158,22 +156,10 @@ TEST(inference, nlp) {
   LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size();
   LOG(INFO) << "Total number of words: " << num_total_words;
 
-  const bool model_combined = false;
   // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // 1. Define place, executor, scope
-  auto place = paddle::platform::CPUPlace();
-  auto executor = paddle::framework::Executor(place);
   std::unique_ptr<paddle::framework::Scope> scope(
       new paddle::framework::Scope());
 
-  // 2. Initialize the inference_program and load parameters
-  std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
-  inference_program =
-      InitProgram(&executor, scope.get(), FLAGS_model_path, model_combined);
-  if (FLAGS_use_mkldnn) {
-    EnableMKLDNN(inference_program);
-  }
-
 #ifdef PADDLE_WITH_MKLML
   // only use 1 thread number per std::thread
   omp_set_dynamic(0);
@@ -189,21 +175,30 @@ TEST(inference, nlp) {
     start_ms = GetCurrentMs();
     for (int i = 0; i < FLAGS_num_threads; ++i) {
       threads.emplace_back(
-          new std::thread(ThreadRunInfer, i, &executor, scope.get(),
-                          std::ref(inference_program), std::ref(jobs)));
+          new std::thread(ThreadRunInfer, i, scope.get(), std::ref(jobs)));
     }
     for (int i = 0; i < FLAGS_num_threads; ++i) {
       threads[i]->join();
     }
     stop_ms = GetCurrentMs();
   } else {
-    if (FLAGS_prepare_vars) {
-      executor.CreateVariables(*inference_program, scope.get(), 0);
+    // 1. Define place, executor, scope
+    auto place = paddle::platform::CPUPlace();
+    auto executor = paddle::framework::Executor(place);
+
+    // 2. Initialize the inference_program and load parameters
+    std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
+    inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path,
+                                    /*model combined*/ false);
+    if (FLAGS_use_mkldnn) {
+      EnableMKLDNN(inference_program);
     }
     // always prepare context
     std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
     ctx = executor.Prepare(*inference_program, 0);
-
+    if (FLAGS_prepare_vars) {
+      executor.CreateVariables(*inference_program, scope.get(), 0);
+    }
     // preapre fetch
     const std::vector<std::string>& fetch_target_names =
         inference_program->GetFetchTargetNames();
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index de6ff29c6f8edbcf930546ff157a1c226e1311db..5e86b16ba1ff69c798372a144fb3bf699768f2e6 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -166,8 +166,6 @@ function(op_library TARGET)
       # NOTE(*): activation use macro to regist the kernels, set use_op manually.
       if(${TARGET} STREQUAL "activation")
         file(APPEND ${pybind_file} "USE_OP(relu);\n")
-      elseif(${TARGET} STREQUAL "reduce")
-        file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
       elseif(${TARGET} STREQUAL "fake_dequantize")
         file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
       else()
@@ -227,6 +225,8 @@ op_library(softmax_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 if (WITH_GPU AND TENSORRT_FOUND)
     op_library(tensorrt_engine_op DEPS tensorrt_engine)
+    nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
+      DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter)
 else()
     set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
 endif()
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index dd71c66a75a039429f6e4b1771bb31508bb6b56d..96e4c0e04cc30db6d0b86376434d5ea02694ae21 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -24,12 +24,12 @@ namespace operators {
       : public ::paddle::framework::OpProtoAndCheckerMaker {            \
    public:                                                              \
     void Make() override {                                              \
-      AddInput("X", "Input of " #OP_NAME "operator");                   \
-      AddOutput("Out", "Output of" #OP_NAME "operator");                \
+      AddInput("X", "Input of " #OP_NAME " operator");                  \
+      AddOutput("Out", "Output of " #OP_NAME " operator");              \
       AddAttr<bool>("use_mkldnn",                                       \
                     "(bool, default false) Only used in mkldnn kernel") \
           .SetDefault(false);                                           \
-      AddComment(#OP_COMMENT);                                          \
+      AddComment(OP_COMMENT);                                           \
     }                                                                   \
   }
 
@@ -58,14 +58,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
                                       const framework::OperatorWithKernel& oper,
                                       const std::string& name) {
   framework::LibraryType library{framework::LibraryType::kPlain};
+
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
   auto it = oper.Attrs().find("use_mkldnn");
   if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
   }
 #endif
-  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()),
       ctx.GetPlace(), layout, library);
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 6ec8c9d18b466142acdb46b0f46826a2aca7a47e..d7e0af28c1bfa6a9073b25b0a301234cc5d194f5 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -111,14 +111,16 @@ class BatchNormOp : public framework::OperatorWithKernel {
                       "Variance input should be of float type");
 
     framework::LibraryType library_{framework::LibraryType::kPlain};
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
 #ifdef PADDLE_WITH_MKLDNN
     if (library_ == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
     }
 #endif
-    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
     return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                    library_);
   }
@@ -367,17 +369,18 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     }
 
     framework::LibraryType library_{framework::LibraryType::kPlain};
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
     if (library_ == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
-    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout, library_);
+        layout_, library_);
   }
 };
 
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 697d91484257984b104a13b0572cf19b16f8d37e..850297a2327f33a4a765f64f201e217fce5db89b 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -75,6 +75,11 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library{framework::LibraryType::kPlain};
+
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  framework::DataLayout layout = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library = framework::LibraryType::kCUDNN;
@@ -84,6 +89,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
   if (library == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
   }
 #endif
 
@@ -99,9 +105,6 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
                       "float16 can only be used when CUDNN is used");
   }
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                  library);
 }
@@ -309,6 +312,10 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kCUDNN;
@@ -318,12 +325,10 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
       layout_, library_);
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
index 669b3bbe9df4cae1aa381184092dfa51157ab6a3..5b5a220cf90e7813f914ae35733e7a4103391b2d 100644
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
@@ -48,6 +48,13 @@ class CropOp : public framework::OperatorWithKernel {
       ctx->SetOutputDim("Out", y_dim);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
 class CropOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -60,13 +67,19 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker {
              "The input used as reference for cropping, "
              "which is of the same dimensions as X.")
         .AsDispensable();
+    AddInput("Offsets",
+             "The input used to describe offsets in runtime, which is a "
+             "1-D vector whose size equals to the rank of input 'X'. The "
+             "elements data type must be int.")
+        .AsDispensable();
     AddOutput("Out",
               "The output of crop op, "
               "which is of the same dimensions as X.");
     AddAttr<std::vector<int>>("offsets",
                               "A list<int> describing offsets to be cropped. "
                               "The size of offsets list should be the same as "
-                              "the dimension size of input X.");
+                              "the dimension size of input X.")
+        .SetDefault(std::vector<int>());
     AddAttr<std::vector<int>>("shape",
                               "A list<int> describing the shape of output. "
                               "The size of shape list should be the same as "
@@ -77,6 +90,17 @@ Crop Operator.
 
 Crop input into output, as specified by offsets and shape.
 
+There are two ways to set the offsets:
+1. In runtime: Using the input 'Offsets', which is a Vairbale and can be 
+               output of other operators. This way is suitable for 
+               dynamic offsets.
+2. In network configuration: Using the attribute 'offsets', which will be 
+                             set in Python configure script. This way is 
+                             suitable for fixed offsets.
+You CANNOT use these two ways at the same time. An exception will be raised 
+if input 'Offset' is configured and meanwhile the attribute 'offsets' is 
+not empty.
+
 There are two ways to set shape:
 1. reference input: crop input X into the same shape as reference input.
                     The dimension of reference input should
@@ -146,6 +170,15 @@ class CropOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(x_grad_name, x_dims);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
+                ->type()),
+        ctx.device_context());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
index f05c2e23284e3a24cf48442996f671ec6084c391..91cfbbda7352c9b1676aae99e2bd57ccc9e10069 100644
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -27,6 +27,37 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 using framework::Tensor;
 
+static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
+  std::vector<int> res;
+  int rank = ctx.Input<Tensor>("X")->dims().size();
+  if (ctx.HasInput("Offsets")) {
+    PADDLE_ENFORCE(ctx.Attr<std::vector<int>>("offsets").empty(),
+                   "Input 'Offsets' and attribute 'offsets' should not be used "
+                   "at the same time.");
+    const auto* offsets_tensor = ctx.Input<Tensor>("Offsets");
+    PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1);
+    PADDLE_ENFORCE_EQ(
+        rank, offsets_tensor->dims()[0],
+        "Offsets size should be equal to dimension size of input tensor.");
+    const int* offsets_data;
+    framework::Tensor cpu_tmp_tensor;
+    if (platform::is_cpu_place(offsets_tensor->place())) {
+      offsets_data = offsets_tensor->data<int>();
+    } else {
+      framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(),
+                                &cpu_tmp_tensor);
+      offsets_data = cpu_tmp_tensor.data<int>();
+    }
+    res = std::vector<int>(offsets_data, offsets_data + rank);
+  } else {
+    res = ctx.Attr<std::vector<int>>("offsets");
+    PADDLE_ENFORCE_EQ(
+        rank, res.size(),
+        "Offsets size should be equal to dimension size of input tensor.");
+  }
+  return res;
+}
+
 template <typename T>
 class CropKernel : public framework::OpKernel<T> {
  public:
@@ -37,10 +68,7 @@ class CropKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
     auto x_stride = framework::stride(x->dims());
     auto out_stride = framework::stride(out->dims());
-    auto offsets = context.Attr<std::vector<int>>("offsets");
-    PADDLE_ENFORCE_EQ(
-        x->dims().size(), static_cast<int64_t>(offsets.size()),
-        "Offsets size should be equal to dimension size of input tensor.");
+    auto offsets = GetOffsets(context);
     int64_t offset = 0;
     for (size_t i = 0; i < offsets.size(); ++i) {
       offset += (x_stride[i] * offsets[i]);
@@ -56,7 +84,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
   if (d_x != nullptr) {
     auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
     d_x->mutable_data<T>(context.GetPlace());
-    auto offsets = context.Attr<std::vector<int>>("offsets");
+    auto offsets = GetOffsets(context);
     Eigen::array<std::pair<int, int>, D> paddings;
     for (size_t i = 0; i < D; ++i) {
       paddings[i].first = offsets[i];
diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt
index cf20530513cf6cd420e56b2f6378225f73c2bc8b..c29dc5d7e077a73b7db6a0c9204c2029ec1392bc 100644
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
@@ -1,6 +1,6 @@
 if(WITH_DISTRIBUTE)
   grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
-      request_handler_impl.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
+      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
       selected_rows memory)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index f4d83e86ecb01eed863a387d827023a5d808dad0..fae39418b4166c9110a40b550e0b9801075edc7e 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -25,29 +25,15 @@ namespace paddle {
 namespace operators {
 namespace detail {
 
-std::once_flag RPCClient::init_flag_;
+void GRPCClient::InitImpl() { InitEventLoop(); }
 
-std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
-
-RPCClient* RPCClient::GetInstance() {
-  std::call_once(init_flag_, &RPCClient::Init);
-  return rpc_client_.get();
-}
-
-void RPCClient::Init() {
-  if (rpc_client_.get() == nullptr) {
-    rpc_client_.reset(new RPCClient());
-  }
-  rpc_client_->InitEventLoop();
-}
-
-void RPCClient::InitEventLoop() {
+void GRPCClient::InitEventLoop() {
   // start the client process thread
   // TODO(wuyi): can make this in a threadpool
-  client_thread_.reset(new std::thread(std::bind(&RPCClient::Proceed, this)));
+  client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }
 
-RPCClient::~RPCClient() {
+GRPCClient::~GRPCClient() {
   Wait();
   cq_.Shutdown();
   {
@@ -59,11 +45,10 @@ RPCClient::~RPCClient() {
   client_thread_->join();
 }
 
-bool RPCClient::AsyncSendVariable(const std::string& ep,
-                                  const platform::DeviceContext& ctx,
-                                  const framework::Scope& scope,
-                                  const std::string& var_name,
-                                  int64_t time_out) {
+bool GRPCClient::AsyncSendVar(const std::string& ep,
+                              const platform::DeviceContext& ctx,
+                              const framework::Scope& scope,
+                              const std::string& var_name, int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
@@ -113,11 +98,10 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
   result->Swap(&tmp);
 }
 
-bool RPCClient::AsyncGetVariable(const std::string& ep,
-                                 const platform::DeviceContext& ctx,
-                                 const framework::Scope& scope,
-                                 const std::string& var_name,
-                                 int64_t time_out) {
+bool GRPCClient::AsyncGetVar(const std::string& ep,
+                             const platform::DeviceContext& ctx,
+                             const framework::Scope& scope,
+                             const std::string& var_name, int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
@@ -155,12 +139,12 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
   return true;
 }
 
-bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
-                                      const platform::DeviceContext& ctx,
-                                      const framework::Scope& scope,
-                                      const std::string& in_var_name,
-                                      const std::string& out_var_name,
-                                      int64_t time_out) {
+bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
+                                  const platform::DeviceContext& ctx,
+                                  const framework::Scope& scope,
+                                  const std::string& in_var_name,
+                                  const std::string& out_var_name,
+                                  int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string in_var_name_val = in_var_name;
@@ -198,7 +182,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
   return true;
 }
 
-void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
+void GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
+                                       int64_t time_out) {
   const auto ch = GetChannel(ep);
 
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
@@ -211,7 +196,8 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
   req_count_++;
 }
 
-void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
+void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
+                                       int64_t time_out) {
   const auto ch = GetChannel(ep);
   FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
   s->Prepare(time_out);
@@ -223,12 +209,12 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
   req_count_++;
 }
 
-void RPCClient::Wait() {
+void GRPCClient::Wait() {
   std::unique_lock<std::mutex> lk(sync_mutex_);
   sync_cond_.wait(lk, [this] { return req_count_ == 0; });
 }
 
-void RPCClient::Proceed() {
+void GRPCClient::Proceed() {
   void* tag = nullptr;
   bool ok = false;
 
@@ -251,7 +237,7 @@ void RPCClient::Proceed() {
   }
 }
 
-std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
+std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
   // TODO(Yancey1989): make grpc client completely thread-safe
   std::lock_guard<std::mutex> guard(chan_mutex_);
   auto it = channels_.find(ep);
diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h
index bb3813efcf4f77a8ec3d2f4b39969faa6216e38f..8db73f875e3e2048386e91f6b5efb29b4ee7e193 100644
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
@@ -38,6 +38,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/detail/rpc_client.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
@@ -164,47 +165,46 @@ class FetchBarrierProcessor : public BaseProcessor {
   std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
 
-class RPCClient {
+class GRPCClient : public RPCClient {
  public:
-  RPCClient() {}
-  ~RPCClient();
+  GRPCClient() {}
+  virtual ~GRPCClient();
 
-  static RPCClient* GetInstance();
+  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
+                    const framework::Scope& scope, const std::string& var_name,
+                    int64_t time_out = RPCClient::rpc_time_out) override;
 
-  bool AsyncSendVariable(const std::string& ep,
-                         const platform::DeviceContext& ctx,
-                         const framework::Scope& scope,
-                         const std::string& var_name,
-                         int64_t time_out = 600 * 1000);
+  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
+                   const framework::Scope& scope, const std::string& var_name,
+                   int64_t time_out = RPCClient::rpc_time_out) override;
 
-  bool AsyncGetVariable(const std::string& ep,
+  bool AsyncPrefetchVar(const std::string& ep,
                         const platform::DeviceContext& ctx,
                         const framework::Scope& scope,
-                        const std::string& var_name,
-                        int64_t time_out = 600 * 1000);
+                        const std::string& in_var_name,
+                        const std::string& out_var_name,
+                        int64_t time_out = RPCClient::rpc_time_out) override;
 
-  bool AsyncPrefetchVariable(const std::string& ep,
-                             const platform::DeviceContext& ctx,
-                             const framework::Scope& scope,
-                             const std::string& in_var_name,
-                             const std::string& out_var_name,
-                             int64_t time_out = 600 * 1000);
+  void AsyncSendBatchBarrier(
+      const std::string& ep,
+      int64_t time_out = RPCClient::rpc_time_out) override;
 
-  void AsyncSendBatchBarrier(const std::string& ep,
-                             int64_t time_out = 600 * 1000);
+  void AsyncSendFetchBarrier(
+      const std::string& ep,
+      int64_t time_out = RPCClient::rpc_time_out) override;
 
-  void AsyncSendFetchBarrier(const std::string& ep,
-                             int64_t time_out = 600 * 1000);
+  void Wait() override;
 
-  void Wait();
+ protected:
+  void InitImpl() override;
+
+ private:
   // InitEventLoop should only be called by Init()
   void InitEventLoop();
 
- private:
   void Proceed();
+
   std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
-  // Init is called by GetInstance.
-  static void Init();
 
  private:
   grpc::CompletionQueue cq_;
@@ -218,9 +218,7 @@ class RPCClient {
 
   // mutex for GetChannel thread safety
   std::mutex chan_mutex_;
-  static std::unique_ptr<RPCClient> rpc_client_;
-  static std::once_flag init_flag_;
-  DISABLE_COPY_AND_ASSIGN(RPCClient);
+  DISABLE_COPY_AND_ASSIGN(GRPCClient);
 };
 
 }  // namespace detail
diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc
index 22a3a8135759c04b051d4ec2d2707e6752df2de2..a1f9ba15e656a686c4bb0d81cf00dea120b8c0ad 100644
--- a/paddle/fluid/operators/detail/grpc_server_test.cc
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/detail/grpc_client.h"
 #include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/detail/rpc_client.h"
 
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -123,7 +124,8 @@ TEST(PREFETCH, CPU) {
   std::thread server_thread(StartServer);
   g_rpc_service->WaitServerReady();
 
-  detail::RPCClient* client = detail::RPCClient::GetInstance();
+  detail::RPCClient* client =
+      detail::RPCClient::GetInstance<detail::GRPCClient>();
   int port = g_rpc_service->GetSelectedPort();
   std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
 
@@ -137,7 +139,7 @@ TEST(PREFETCH, CPU) {
     std::string in_var_name("ids");
     std::string out_var_name("out");
 
-    client->AsyncPrefetchVariable(ep, ctx, scope, in_var_name, out_var_name);
+    client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name);
     client->Wait();
     auto var = scope.Var(out_var_name);
     auto value = var->GetMutable<framework::SelectedRows>()->value();
diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/detail/request_handler.h
index 4bc5e7f10ee2a8939d230fe96517bd9f56c13933..d74206aaba6a79ee06475985e642221bd84d9382 100644
--- a/paddle/fluid/operators/detail/request_handler.h
+++ b/paddle/fluid/operators/detail/request_handler.h
@@ -80,7 +80,6 @@ class RequestHandler {
   }
   framework::ProgramDesc* program() { return program_; }
   framework::Executor* executor() { return executor_; }
-  std::vector<framework::Variable*>& sparse_vars() { return sparse_vars_; }
 
   // This function processes user's rpc request.
   // The implemention is in request_handler_impl.
@@ -113,13 +112,7 @@ class RequestHandler {
   std::unordered_map<std::string,
                      std::shared_ptr<framework::ExecutorPrepareContext>>*
       grad_to_prepared_ctx_;
-
-  // Record received sparse variables, so that
-  // we could reset those after execute optimize program
-  std::vector<framework::Variable*> sparse_vars_;
   RPCServer* rpc_server_;
-
-  std::mutex sparse_var_mutex_;
 };
 
 }  // namespace detail
diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc
index f16c06d52f4fb86d51083a8b3b98d05a64c1af74..9473dce55029f2a4e0987ab8f6f5e7205d7fff47 100644
--- a/paddle/fluid/operators/detail/request_handler_impl.cc
+++ b/paddle/fluid/operators/detail/request_handler_impl.cc
@@ -63,16 +63,22 @@ bool RequestSendHandler::Handle(const std::string& varname,
       PADDLE_THROW("sync: Can not find server side var");
       return false;
     }
-
     if (invar->IsType<framework::SelectedRows>()) {
-      std::unique_lock<std::mutex> lock(sparse_var_mutex_);
+      std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
       sparse_vars_.push_back(invar);
     }
   }
-
   return true;
 }
 
+void RequestSendHandler::ResetSparseVarRecorder() {
+  std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
+  for (auto* var : sparse_vars_) {
+    var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
+  }
+  sparse_vars_.clear();
+}
+
 bool RequestGetHandler::Handle(const std::string& varname,
                                framework::Scope* scope,
                                framework::Variable* invar,
diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/detail/request_handler_impl.h
index 8d0c62232b68ad6c05e751c25103802ee12db57e..443d951914dd0f40e8831abc637848363d9fef16 100644
--- a/paddle/fluid/operators/detail/request_handler_impl.h
+++ b/paddle/fluid/operators/detail/request_handler_impl.h
@@ -41,6 +41,11 @@ class RequestSendHandler final : public RequestHandler {
   virtual ~RequestSendHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar) override;
+  void ResetSparseVarRecorder();
+
+ private:
+  std::mutex mutex_sparse_vars_;
+  std::vector<framework::Variable*> sparse_vars_;
 };
 
 class RequestGetHandler final : public RequestHandler {
diff --git a/paddle/fluid/operators/detail/rpc_client.cc b/paddle/fluid/operators/detail/rpc_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9a791403e3d6b99c5d4de5183e83e1af655d7d4c
--- /dev/null
+++ b/paddle/fluid/operators/detail/rpc_client.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/detail/rpc_client.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+std::once_flag RPCClient::init_flag_;
+std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_client.h b/paddle/fluid/operators/detail/rpc_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e3717f076db6a52916d0e15813f9f61148c6553
--- /dev/null
+++ b/paddle/fluid/operators/detail/rpc_client.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class RPCClient {
+ public:
+  virtual bool AsyncSendVar(const std::string& ep,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name,
+                            int64_t time_out = rpc_time_out) = 0;
+
+  virtual bool AsyncGetVar(const std::string& ep,
+                           const platform::DeviceContext& ctx,
+                           const framework::Scope& scope,
+                           const std::string& var_name,
+                           int64_t time_out = rpc_time_out) = 0;
+
+  virtual bool AsyncPrefetchVar(const std::string& ep,
+                                const platform::DeviceContext& ctx,
+                                const framework::Scope& scope,
+                                const std::string& in_var_name,
+                                const std::string& out_var_name,
+                                int64_t time_out = rpc_time_out) = 0;
+
+  virtual void AsyncSendBatchBarrier(const std::string& ep,
+                                     int64_t time_out = rpc_time_out) = 0;
+
+  virtual void AsyncSendFetchBarrier(const std::string& ep,
+                                     int64_t time_out = rpc_time_out) = 0;
+
+  virtual void Wait() = 0;
+
+  static constexpr int64_t rpc_time_out = 120 * 1000;
+
+  template <typename T>
+  static RPCClient* GetInstance() {
+    std::call_once(init_flag_, &RPCClient::Init<T>);
+    return rpc_client_.get();
+  }
+
+  // Init is called by GetInstance.
+  template <typename T>
+  static void Init() {
+    if (rpc_client_.get() == nullptr) {
+      rpc_client_.reset(new T());
+      rpc_client_->InitImpl();
+    }
+  }
+
+ protected:
+  virtual void InitImpl() {}
+
+ private:
+  static std::once_flag init_flag_;
+  static std::unique_ptr<RPCClient> rpc_client_;
+};
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/detail/rpc_server.h
index c2e7ae706c9dc6776e09b25e424b30f110c3855d..f809c13c726ac2f1c60e8cf84848c4138f631b44 100644
--- a/paddle/fluid/operators/detail/rpc_server.h
+++ b/paddle/fluid/operators/detail/rpc_server.h
@@ -60,6 +60,7 @@ class RPCServer {
   void SetCond(const std::string& rpc_name);
   void WaitCond(const std::string& rpc_name);
   void IncreaseBatchBarrier(const std::string rpc_name);
+
   void ResetBarrierCounter();
 
  protected:
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 8843a1c44b7004ba5d7935f75d3c99d9c30fc6c0..a9ae1396db8d7dab0364779e506d5c0a3e2ff6ed 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -43,7 +43,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType FCOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library{framework::LibraryType::kMKLDNN};
-  framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+  framework::DataLayout layout{framework::DataLayout::kMKLDNN};
 
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
@@ -65,7 +65,7 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType FCOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library{framework::LibraryType::kMKLDNN};
-  framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+  framework::DataLayout layout{framework::DataLayout::kMKLDNN};
 
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc
index 1e2c93335fb9cc6b231857783743eda4e387bf39..ad67585e485b237f5d4b809af712ce658ef602bb 100644
--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 
 #include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/rpc_client.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
@@ -43,7 +44,8 @@ class FetchBarrierOp : public framework::OperatorBase {
     // For profiling
     platform::RecordEvent record_event(Type(), &ctx);
 
-    auto rpc_client = detail::RPCClient::GetInstance();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
 
     rpc_client->Wait();
 
diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc
index 9c0561b016fdbfa8e48535eaa673a3f85bc936e5..f6b156eb30dae154395b34dcfc26319cd89edbca 100644
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
@@ -43,7 +43,8 @@ TEST(Gather, GatherData) {
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
   paddle::operators::CPUGather<int>(ctx, *src, *index, output);
-
+  delete cpu_place;
+  cpu_place = NULL;
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
 
diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc
index 4bce2d322d825110a446c9bc5eccdacf0ba3c943..547de4fa49dc16182c424118c0f5705d2396100a 100644
--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -61,12 +61,13 @@ class GenNCCLIdOp : public framework::OperatorBase {
 
     std::vector<std::string> endpoint_list =
         Attr<std::vector<std::string>>("endpoint_list");
-    detail::RPCClient client;
+    detail::RPCClient* client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
     for (auto& ep : endpoint_list) {
       VLOG(3) << "sending nccl id to " << ep;
-      client.AsyncSendVariable(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
+      client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
     }
-    client.Wait();
+    client->Wait();
     VLOG(3) << "sending completed...";
   }
 
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 66a0f87b46c6447bac7e42f0f61e3170cb1f2fdb..66d31c88951926a6dd9b7262942a69bb1564a416 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -108,9 +108,6 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
       std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
 
   rpc_service_->ResetBarrierCounter();
-  // Record received sparse variables, so that
-  // we could reset those after execute optimize program
-  std::vector<framework::Variable *> sparse_vars;
   while (true) {
     // Get from multiple trainers, we don't care about the order in which
     // the gradients arrives, just add suffix 0~n and merge the gradient.
@@ -146,18 +143,12 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
                           recv_scope);
     VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";
 
-    // Reset the received sparse variables, the sum operator would not
-    // sum the input sparse variables which rows is empty at the next
-    // mini-batch.
-    // TODO(Yancey1989): move the reset action into an operator, we couldn't
-    // have any hide logic in the operator.
-    for (framework::Variable *var : sparse_vars) {
-      var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
-    }
-
     rpc_service_->SetCond(detail::kRequestGet);
     rpc_service_->WaitBarrier(detail::kRequestGet);
     rpc_service_->ResetBarrierCounter();
+    // reset received sparse vars to avoid reuse it in the next mini-batch
+    dynamic_cast<detail::RequestSendHandler *>(request_send_handler_.get())
+        ->ResetSparseVarRecorder();
   }  // while(true)
 }
 
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index 52b9cd7fb7019b738098a8649f23277afd40e938..52b459a6a2e56b7c256efdb535b4652c64bae23c 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -124,16 +124,17 @@ namespace {
 framework::OpKernelType GetExpectedLRNKernel(
     const framework::ExecutionContext& ctx) {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
 #ifdef PADDLE_WITH_MKLDNN
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
       layout_, library_);
diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc
index 3719a264e90ea7d1a99eb9589ce4fd0d8e074781..b545671b43d3a453ab03e4774427179617f62db0 100644
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
@@ -77,6 +77,8 @@ TEST(math_function, gemm_trans_clbas) {
   paddle::platform::CPUDeviceContext context(*cpu_place);
   GetBlas<float>(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3,
                                input2_ptr + 3, 3, 1, input3_ptr + 1, 4);
+  delete cpu_place;
+  cpu_place = NULL;
 
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc
index 60e936298defe7c6ce8a33bdc7de05b52eb950e7..a045f9e98dd7348973c3c4506f44d3e261599a14 100644
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
@@ -24,10 +24,13 @@ using mkldnn::pooling_backward;
 
 // Generate keys for storing/retriving primitives for this operator
 // TODO(jczaja): Make hashing function more optimial
-static std::string gethash(memory::dims& input_dims, std::string& pooling_type,
-                           std::vector<int>& ksize, std::vector<int>& strides,
-                           std::vector<int>& paddings, std::string suffix) {
-  auto dims2str = [](memory::dims& operand_dims) {
+static std::string gethash(const memory::dims& input_dims,
+                           const std::string& pooling_type,
+                           const std::vector<int>& ksize,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::string& suffix) {
+  auto dims2str = [](const memory::dims& operand_dims) {
     std::string dstr = "";
     for (size_t i = 0; i < operand_dims.size(); ++i) {
       dstr += std::to_string(operand_dims[i]) + "-";
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index f4fb2b132fe8d59cb50f5a1f7359240ac50445fe..18aa2bd352c5d184b5748e57b4af17c1ae0d7a82 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -83,6 +83,9 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
 framework::OpKernelType PoolOp::GetExpectedKernelType(
     const framework::ExecutionContext &ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kCUDNN;
@@ -92,11 +95,10 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
       layout_, library_);
@@ -112,6 +114,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
 framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext &ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kCUDNN;
@@ -121,6 +126,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
@@ -129,8 +135,6 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
     PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
                       "float16 can only be used when CUDNN is used");
   }
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
                                  library_);
 }
diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc
index 167a06e090c1d5a15f502098e5fe4968693bcc04..d96359d6befa68bdd0c255dde9c63bfc7fffc0a5 100644
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -41,14 +41,14 @@ class PrefetchOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
 
-    auto rpc_client = detail::RPCClient::GetInstance();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
                 << outs[i] << " back";
-        rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i],
-                                          outs[i]);
+        rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]);
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
       }
diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc
index b14b559e31dd422f8ebe4002988a9746dfdf28a2..528a6e4a1b68fe611d104f21bafe970762611a03 100644
--- a/paddle/fluid/operators/random_crop_op.cc
+++ b/paddle/fluid/operators/random_crop_op.cc
@@ -20,7 +20,6 @@ class RandomCropOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
@@ -36,11 +35,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Seed", "The random seed.");
     AddOutput("Out", "The cropped instance batch.");
     AddOutput("SeedOut", "The random seed after random cropping.")
-        .AsDispensable();
+        .AsIntermediate();
     AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
     AddComment(R"DOC(
-      This operator takes a batch of instance, and do random cropping on each instance. 
-      It means that cropping positions differs on each instance, which is determined 
+      This operator takes a batch of instance, and do random cropping on each instance.
+      It means that cropping positions differs on each instance, which is determined
       by an uniform random generator. All cropped instances have the same shape, which 
       is determined by the operator's attribute 'shape'.
     )DOC");
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
index 49b480948a788dc22f95a4eafc6f780298d7c5f9..1ea1cc458b2a20017cc36457af81387a8c808642 100644
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -44,11 +44,12 @@ class RecvOp : public framework::OperatorBase {
     // For profiling
     platform::RecordEvent record_event(Type(), &ctx);
 
-    auto rpc_client = detail::RPCClient::GetInstance();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
 
     for (size_t i = 0; i < outs.size(); i++) {
       VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
-      rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+      rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
     }
     if (sync_mode) {
       rpc_client->Wait();
diff --git a/paddle/fluid/operators/reduce_max_op.cc b/paddle/fluid/operators/reduce_max_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..95d3768e1fdf6947659c7b3a1c9d57fad741472a
--- /dev/null
+++ b/paddle/fluid/operators/reduce_max_op.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_REDUCE_OP(reduce_max);
+REGISTER_OP_CPU_KERNEL(
+    reduce_max, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                                  ops::MaxFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
+                      ops::MaxFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MaxFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
+                      ops::MaxFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0d86b3127e42f7ee14ba57b1c762e8128a0f2d54
--- /dev/null
+++ b/paddle/fluid/operators/reduce_max_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_max,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MaxFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MaxFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MaxFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MaxFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.cc b/paddle/fluid/operators/reduce_mean_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc258c2496340b47d24dc89f16f7419dbb4b0d95
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_mean_op.h"
+
+REGISTER_REDUCE_OP(reduce_mean);
+REGISTER_OP_CPU_KERNEL(reduce_mean,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         float, ops::MeanFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         double, ops::MeanFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int, ops::MeanFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int64_t, ops::MeanFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_mean_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             float, ops::MeanGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             double, ops::MeanGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int, ops::MeanGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int64_t, ops::MeanGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..960cb3235be7f4cc98b97d3b088ceaeb3d4a4209
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_mean_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_mean,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MeanFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                            float, ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MeanGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.h b/paddle/fluid/operators/reduce_mean_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..1359679c4767d2032bf3e3a90849ad2a2ef3e829
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct MeanFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->mean(dim);
+  }
+};
+
+struct MeanGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_min_max_op.h b/paddle/fluid/operators/reduce_min_max_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec59f3e71c1c702655a3feed10935b2f5a29d8a8
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_max_op.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct MaxFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->maximum(dim);
+  }
+};
+
+struct MinFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->minimum(dim);
+  }
+};
+
+struct MaxOrMinGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    auto equals = (*x) == y->broadcast(dim);
+    auto ones = dx->constant(1);
+    auto zeros = dx->constant(0);
+    // If there are multiple minimum or maximum elements, the subgradient of
+    // each is the set [0, 1], and we pass gradient to all of them here.
+    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_min_op.cc b/paddle/fluid/operators/reduce_min_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..330a86d2e4237a10d8cf6fd40025540edf08d897
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_op.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_REDUCE_OP(reduce_min);
+REGISTER_OP_CPU_KERNEL(
+    reduce_min, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                                  ops::MinFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
+                      ops::MinFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
+                      ops::MinFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..da466f805eff4709dc23471baef03e94052ee6c1
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_min,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MinFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MinFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MinFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MinFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc
deleted file mode 100644
index e293fd5e410b2a34b3c71ea674607ba9d7654535..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_op.cc
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/reduce_op.h"
-
-#include <algorithm>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class ReduceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ReduceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ReduceOp should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
-    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
-    for (size_t i = 0; i < dims.size(); ++i) {
-      if (dims[i] < 0) dims[i] = x_rank + dims[i];
-      PADDLE_ENFORCE_LT(
-          dims[i], x_rank,
-          "The dim should be in the range [-rank(input), rank(input)).");
-    }
-    sort(dims.begin(), dims.end());
-    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
-    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
-    if (reduce_all) {
-      if (keep_dim)
-        ctx->SetOutputDim(
-            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
-      else
-        ctx->SetOutputDim("Out", {1});
-    } else {
-      auto dims_vector = vectorize(x_dims);
-      if (keep_dim) {
-        for (size_t i = 0; i < dims.size(); ++i) {
-          dims_vector[dims[i]] = 1;
-        }
-      } else {
-        const int kDelFlag = -2;
-        for (size_t i = 0; i < dims.size(); ++i) {
-          dims_vector[dims[i]] = kDelFlag;
-        }
-        dims_vector.erase(
-            remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
-            dims_vector.end());
-      }
-      auto out_dims = framework::make_ddim(dims_vector);
-      ctx->SetOutputDim("Out", out_dims);
-      if (dims[0] != 0) {
-        // Only pass LoD when not reducing on the first dim.
-        ctx->ShareLoD("X", /*->*/ "Out");
-      }
-    }
-  }
-};
-
-class ReduceGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
-    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
-    for (size_t i = 0; i < dims.size(); ++i) {
-      if (dims[i] < 0) dims[i] = x_rank + dims[i];
-      PADDLE_ENFORCE_LT(
-          dims[i], x_rank,
-          "The dim should be in the range [-rank(input), rank(input)).");
-    }
-    sort(dims.begin(), dims.end());
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-      ctx->ShareLoD("X", /*->*/ x_grad_name);
-    }
-  }
-};
-
-class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() final {
-    AddInput("X",
-             "(Tensor) The input tensor. Tensors with rank at most 6 are "
-             "supported.");
-    AddOutput("Out", "(Tensor) The result tensor.");
-    AddAttr<std::vector<int>>(
-        "dim",
-        "(list<int>, default {0}) The dimensions to reduce. "
-        "Must be in the range [-rank(input), rank(input)). "
-        "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
-        "Note that reducing on the first dim will make the LoD info lost.")
-        .SetDefault({0});
-    AddAttr<bool>("keep_dim",
-                  "(bool, default false) "
-                  "If true, retain the reduced dimension with length 1.")
-        .SetDefault(false);
-    AddAttr<bool>("reduce_all",
-                  "(bool, default false) "
-                  "If true, output a scalar reduced along all dimensions.")
-        .SetDefault(false);
-    AddComment(string::Sprintf(R"DOC(
-%s Operator.
-
-This operator computes the %s of input tensor along the given dimension.
-The result tensor has 1 fewer dimension than the input unless keep_dim is true.
-If reduce_all is true, just reduce along all dimensions and output a scalar.
-
-)DOC",
-                               GetOpType(), GetName()));
-  }
-
- protected:
-  virtual std::string GetName() const = 0;
-  virtual std::string GetOpType() const = 0;
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-#define REGISTER_REDUCE_OP(op_name)                                        \
-  class __##op_name##Maker__ : public ops::ReduceOpMaker {                 \
-   protected:                                                              \
-    virtual std::string GetName() const { return #op_name; }               \
-    virtual std::string GetOpType() const { return "Reduce " #op_name; }   \
-  };                                                                       \
-  REGISTER_OPERATOR(reduce_##op_name, ops::ReduceOp, __##op_name##Maker__, \
-                    paddle::framework::DefaultGradOpDescMaker<true>);      \
-  REGISTER_OPERATOR(reduce_##op_name##_grad, ops::ReduceGradOp)
-
-REGISTER_REDUCE_OP(sum);
-REGISTER_REDUCE_OP(mean);
-REGISTER_REDUCE_OP(max);
-REGISTER_REDUCE_OP(min);
-REGISTER_REDUCE_OP(prod);
-
-#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)         \
-  REGISTER_OP_CPU_KERNEL(reduce_type,                                          \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           float, ops::functor>,               \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           double, ops::functor>,              \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           int, ops::functor>,                 \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           int64_t, ops::functor>);            \
-  REGISTER_OP_CPU_KERNEL(                                                      \
-      reduce_type##_grad,                                                      \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float,         \
-                            ops::grad_functor>,                                \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,        \
-                            ops::grad_functor>,                                \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,           \
-                            ops::grad_functor>,                                \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,       \
-                            ops::grad_functor>);
-
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
diff --git a/paddle/fluid/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu
deleted file mode 100644
index ae29587f55847315b1d84f1344677e753fe01a9b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_op.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#define EIGEN_USE_GPU
-#include "paddle/fluid/operators/reduce_op.h"
-
-namespace ops = paddle::operators;
-
-#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor)    \
-  REGISTER_OP_CUDA_KERNEL(                                                \
-      reduce_type, ops::ReduceKernel<paddle::platform::CUDADeviceContext, \
-                                     float, ops::functor>,                \
-      ops::ReduceKernel<paddle::platform::CUDADeviceContext, double,      \
-                        ops::functor>,                                    \
-      ops::ReduceKernel<paddle::platform::CUDADeviceContext, int,         \
-                        ops::functor>,                                    \
-      ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t,     \
-                        ops::functor>);                                   \
-  REGISTER_OP_CUDA_KERNEL(                                                \
-      reduce_type##_grad,                                                 \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float,   \
-                            ops::grad_functor>,                           \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,  \
-                            ops::grad_functor>,                           \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,     \
-                            ops::grad_functor>,                           \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, \
-                            ops::grad_functor>);
-
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL);
diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h
index 7df47f316c30b9eb2644677681b91023e1838548..72b6cf1773d5bcc42e40e72111179d454d2bb4a9 100644
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
@@ -14,105 +14,20 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
+#include <string>
 #include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
+
+#include "paddle/fluid/operators/reduce_op_function.h"
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-struct SumFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->sum(dim);
-  }
-};
-
-struct SumGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim);
-  }
-};
-
-struct MeanFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->mean(dim);
-  }
-};
-
-struct MeanGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
-  }
-};
-
-struct MaxFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->maximum(dim);
-  }
-};
-
-struct MinFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->minimum(dim);
-  }
-};
-
-struct MaxOrMinGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    auto equals = (*x) == y->broadcast(dim);
-    auto ones = dx->constant(1);
-    auto zeros = dx->constant(0);
-    // If there are multiple minimum or maximum elements, the subgradient of
-    // each is the set [0, 1], and we pass gradient to all of them here.
-    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
-  }
-};
-
-struct ProdFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->prod(dim);
-  }
-};
-
-struct ProdGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
-  }
-};
-
-#define HANDLE_DIM(NDIM, RDIM)          \
-  if (ndim == NDIM && rdim == RDIM) {   \
-    ReduceCompute<NDIM, RDIM>(context); \
+#define HANDLE_DIM(NDIM, RDIM)                                            \
+  if (ndim == NDIM && rdim == RDIM) {                                     \
+    ReduceFunctor<DeviceContext, T, NDIM, RDIM, Functor>(                 \
+        context.template device_context<DeviceContext>(), *input, output, \
+        dims, keep_dim);                                                  \
   }
 
 template <typename DeviceContext, typename T, typename Functor>
@@ -120,11 +35,15 @@ class ReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool keep_dim = context.Attr<bool>("keep_dim");
+
     if (reduce_all) {
       // Flatten and reduce 1-D tensor
-      auto* input = context.Input<Tensor>("X");
-      auto* output = context.Output<Tensor>("Out");
-      output->mutable_data<T>(context.GetPlace());
       auto x = EigenVector<T>::Flatten(*input);
       auto out = EigenScalar<T>::From(*output);
       auto& place =
@@ -133,8 +52,8 @@ class ReduceKernel : public framework::OpKernel<T> {
       Functor functor;
       functor(place, &x, &out, reduce_dim);
     } else {
-      int ndim = context.Input<Tensor>("X")->dims().size();
-      int rdim = context.Attr<std::vector<int>>("dim").size();
+      int ndim = input->dims().size();
+      int rdim = dims.size();
       // comments for accelerating compiling temporarily.
       //      HANDLE_DIM(6, 5);
       //      HANDLE_DIM(6, 4);
@@ -154,48 +73,6 @@ class ReduceKernel : public framework::OpKernel<T> {
       HANDLE_DIM(1, 1);
     }
   }
-
- private:
-  template <size_t D, size_t R_D>
-  void ReduceCompute(const framework::ExecutionContext& context) const {
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
-
-    auto x = EigenTensor<T, D>::From(*input);
-    auto x_rank = static_cast<int>(x.dimensions().size());
-    auto dims = context.Attr<std::vector<int>>("dim");
-    auto reduce_dim = Eigen::array<int, R_D>();
-    for (size_t i = 0; i < dims.size(); ++i) {
-      if (dims[i] < 0) dims[i] = x_rank + dims[i];
-      reduce_dim[i] = dims[i];
-    }
-    // construct the squeezed output tensor
-    bool keep_dim = context.Attr<bool>("keep_dim");
-    DDim out_dims = output->dims();
-    if (keep_dim && x_rank > 1) {
-      const int kDelFlag = -2;
-      auto dims_vector = vectorize(out_dims);
-      for (size_t i = 0; i < dims.size(); ++i) {
-        dims_vector[dims[i]] = kDelFlag;
-      }
-      dims_vector.erase(
-          remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
-          dims_vector.end());
-      out_dims = framework::make_ddim(dims_vector);
-    }
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    Functor functor;
-
-    if (D == 1) {
-      auto out = EigenScalar<T>::From(*output);
-      functor(place, &x, &out, reduce_dim);
-    } else {
-      auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
-      functor(place, &x, &out, reduce_dim);
-    }
-  }
 };
 
 template <typename DeviceContext, typename T, typename Functor>
@@ -203,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     bool reduce_all = context.Attr<bool>("reduce_all");
+    auto dims = context.Attr<std::vector<int>>("dim");
+
+    auto* input0 = context.Input<Tensor>("X");
+    auto* input1 = context.Input<Tensor>("Out");
+    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+    output->mutable_data<T>(context.GetPlace());
+
     if (reduce_all) {
-      auto* input0 = context.Input<Tensor>("X");
-      auto* input1 = context.Input<Tensor>("Out");
-      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
-      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
-      output->mutable_data<T>(context.GetPlace());
       auto x = EigenVector<T>::Flatten(*input0);
       auto x_reduce = EigenVector<T>::From(*input1);
       auto x_reduce_grad = EigenVector<T>::From(*input2);
@@ -221,74 +101,172 @@ class ReduceGradKernel : public framework::OpKernel<T> {
       functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
               broadcast_dim[0]);
     } else {
-      int rank = context.Input<Tensor>("X")->dims().size();
+      int rank = input0->dims().size();
       switch (rank) {
         case 1:
-          ReduceGradCompute<1>(context);
+          ReduceGradFunctor<DeviceContext, T, 1, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 2:
-          ReduceGradCompute<2>(context);
+          ReduceGradFunctor<DeviceContext, T, 2, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 3:
-          ReduceGradCompute<3>(context);
+          ReduceGradFunctor<DeviceContext, T, 3, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 4:
-          ReduceGradCompute<4>(context);
+          ReduceGradFunctor<DeviceContext, T, 4, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 5:
-          ReduceGradCompute<5>(context);
+          ReduceGradFunctor<DeviceContext, T, 5, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 6:
-          ReduceGradCompute<6>(context);
+          ReduceGradFunctor<DeviceContext, T, 6, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
       }
     }
   }
+};
 
- private:
-  template <size_t D>
-  void ReduceGradCompute(const framework::ExecutionContext& context) const {
-    auto* input0 = context.Input<Tensor>("X");
-    auto* input1 = context.Input<Tensor>("Out");
-    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+class ReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-    output->mutable_data<T>(context.GetPlace());
-    auto x = EigenTensor<T, D>::From(*input0);
-    auto x_grad = EigenTensor<T, D>::From(*output);
-    auto x_rank = static_cast<int>(x.dimensions().size());
-    auto dims = context.Attr<std::vector<int>>("dim");
-    auto x_dims = input0->dims();
-    auto reduced_dims_v = vectorize(x_dims);
-    Eigen::array<int, D> broadcast_dim;
-    for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReduceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReduceOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
+    for (size_t i = 0; i < dims.size(); ++i) {
+      if (dims[i] < 0) dims[i] = x_rank + dims[i];
+      PADDLE_ENFORCE_LT(
+          dims[i], x_rank,
+          "The dim should be in the range [-rank(input), rank(input)).");
+    }
+    sort(dims.begin(), dims.end());
+    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
+    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
+    if (reduce_all) {
+      if (keep_dim)
+        ctx->SetOutputDim(
+            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
+      else
+        ctx->SetOutputDim("Out", {1});
+    } else {
+      auto dims_vector = vectorize(x_dims);
+      if (keep_dim) {
+        for (size_t i = 0; i < dims.size(); ++i) {
+          dims_vector[dims[i]] = 1;
+        }
+      } else {
+        const int kDelFlag = -2;
+        for (size_t i = 0; i < dims.size(); ++i) {
+          dims_vector[dims[i]] = kDelFlag;
+        }
+        dims_vector.erase(
+            remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+            dims_vector.end());
+      }
+      auto out_dims = framework::make_ddim(dims_vector);
+      ctx->SetOutputDim("Out", out_dims);
+      if (dims[0] != 0) {
+        // Only pass LoD when not reducing on the first dim.
+        ctx->ShareLoD("X", /*->*/ "Out");
+      }
+    }
+  }
+};
+
+class ReduceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-    int broad_cats_times = 1;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
     for (size_t i = 0; i < dims.size(); ++i) {
       if (dims[i] < 0) dims[i] = x_rank + dims[i];
-      reduced_dims_v[dims[i]] = 1;
-      broadcast_dim[dims[i]] = x_dims[dims[i]];
-      broad_cats_times *= x_dims[dims[i]];
+      PADDLE_ENFORCE_LT(
+          dims[i], x_rank,
+          "The dim should be in the range [-rank(input), rank(input)).");
+    }
+    sort(dims.begin(), dims.end());
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
     }
-    auto reduced_dims = framework::make_ddim(reduced_dims_v);
-    auto x_reduce = EigenTensor<T, D>::From(*input1, reduced_dims);
-    auto x_reduce_grad = EigenTensor<T, D>::From(*input2, reduced_dims);
+  }
+};
+
+class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() final {
+    AddInput("X",
+             "(Tensor) The input tensor. Tensors with rank at most 6 are "
+             "supported.");
+    AddOutput("Out", "(Tensor) The result tensor.");
+    AddAttr<std::vector<int>>(
+        "dim",
+        "(list<int>, default {0}) The dimensions to reduce. "
+        "Must be in the range [-rank(input), rank(input)). "
+        "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
+        "Note that reducing on the first dim will make the LoD info lost.")
+        .SetDefault({0});
+    AddAttr<bool>("keep_dim",
+                  "(bool, default false) "
+                  "If true, retain the reduced dimension with length 1.")
+        .SetDefault(false);
+    AddAttr<bool>("reduce_all",
+                  "(bool, default false) "
+                  "If true, output a scalar reduced along all dimensions.")
+        .SetDefault(false);
+    AddComment(string::Sprintf(R"DOC(
+%s Operator.
 
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
+This operator computes the %s of input tensor along the given dimension.
+The result tensor has 1 fewer dimension than the input unless keep_dim is true.
+If reduce_all is true, just reduce along all dimensions and output a scalar.
 
-    Functor functor;
-    functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
-            broad_cats_times);
+)DOC",
+                               GetOpType(), GetName()));
   }
+
+ protected:
+  virtual std::string GetName() const = 0;
+  virtual std::string GetOpType() const = 0;
 };
 
 }  // namespace operators
 }  // namespace paddle
 
-#define FOR_EACH_KERNEL_FUNCTOR(__macro)                \
-  __macro(reduce_sum, SumFunctor, SumGradFunctor);      \
-  __macro(reduce_mean, MeanFunctor, MeanGradFunctor);   \
-  __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \
-  __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \
-  __macro(reduce_prod, ProdFunctor, ProdGradFunctor);
+namespace ops = paddle::operators;
+
+#define REGISTER_REDUCE_OP(op_name)                                      \
+  class __##op_name##Maker__ : public ops::ReduceOpMaker {               \
+   protected:                                                            \
+    virtual std::string GetName() const { return #op_name; }             \
+    virtual std::string GetOpType() const { return "Reduce " #op_name; } \
+  };                                                                     \
+  REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__,        \
+                    paddle::framework::DefaultGradOpDescMaker<true>);    \
+  REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp)
diff --git a/paddle/fluid/operators/reduce_op_function.h b/paddle/fluid/operators/reduce_op_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..3da27bc8ac8d448471b9ff3779ac6aca59fac523
--- /dev/null
+++ b/paddle/fluid/operators/reduce_op_function.h
@@ -0,0 +1,109 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T, size_t D, size_t R_D,
+          typename Functor>
+void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input,
+                   framework::Tensor* output, const std::vector<int>& dims,
+                   bool keep_dim) {
+  auto x = EigenTensor<T, D>::From(input);
+  auto x_rank = static_cast<int>(x.dimensions().size());
+  auto reduce_dim = Eigen::array<int, R_D>();
+  std::vector<int> dims_ref = dims;
+  for (size_t i = 0; i < dims_ref.size(); ++i) {
+    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
+    reduce_dim[i] = dims_ref[i];
+  }
+  // construct the squeezed output tensor
+  DDim out_dims = output->dims();
+  if (keep_dim && x_rank > 1) {
+    const int kDelFlag = -2;
+    auto dims_vector = framework::vectorize(out_dims);
+    for (size_t i = 0; i < dims_ref.size(); ++i) {
+      dims_vector[dims_ref[i]] = kDelFlag;
+    }
+    dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+                      dims_vector.end());
+    out_dims = framework::make_ddim(dims_vector);
+  }
+  auto& place = *context.eigen_device();
+  Functor functor;
+
+  if (D == 1) {
+    auto out = EigenScalar<T>::From(*output);
+    functor(place, &x, &out, reduce_dim);
+  } else {
+    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
+    functor(place, &x, &out, reduce_dim);
+  }
+}
+
+template <typename DeviceContext, typename T, size_t D, typename Functor>
+void ReduceGradFunctor(const DeviceContext& context,
+                       const framework::Tensor& input0,
+                       const framework::Tensor& input1,
+                       const framework::Tensor& input2,
+                       framework::Tensor* output,
+                       const std::vector<int>& dims) {
+  auto x = EigenTensor<T, D>::From(input0);
+  auto x_grad = EigenTensor<T, D>::From(*output);
+  auto x_rank = static_cast<int>(x.dimensions().size());
+  auto x_dims = input0.dims();
+  auto reduced_dims_v = framework::vectorize(x_dims);
+  std::vector<int> dims_ref = dims;
+  Eigen::array<int, D> broadcast_dim;
+  for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
+
+  int broad_cats_times = 1;
+  for (size_t i = 0; i < dims_ref.size(); ++i) {
+    if (dims_ref[i] < 0) {
+      dims_ref[i] = x_rank + dims_ref[i];
+    }
+    reduced_dims_v[dims_ref[i]] = 1;
+    broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]];
+    broad_cats_times *= x_dims[dims_ref[i]];
+  }
+  auto reduced_dims = framework::make_ddim(reduced_dims_v);
+  auto x_reduce = EigenTensor<T, D>::From(input1, reduced_dims);
+  auto x_reduce_grad = EigenTensor<T, D>::From(input2, reduced_dims);
+
+  auto& place = *context.eigen_device();
+
+  Functor functor;
+  functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
+          broad_cats_times);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_prod_op.cc b/paddle/fluid/operators/reduce_prod_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..713728b99757a6f3bb128f665d5576ac64eef8ec
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_prod_op.h"
+
+REGISTER_REDUCE_OP(reduce_prod);
+REGISTER_OP_CPU_KERNEL(reduce_prod,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         float, ops::ProdFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         double, ops::ProdFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int, ops::ProdFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int64_t, ops::ProdFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_prod_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             float, ops::ProdGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             double, ops::ProdGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int, ops::ProdGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int64_t, ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d62e677d92cffecf629d1684026b0c7bcfec29e3
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_prod_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_prod,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::ProdFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                            float, ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_prod_op.h b/paddle/fluid/operators/reduce_prod_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..97748113e092719aceed9d806ca6242077111532
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct ProdFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->prod(dim);
+  }
+};
+
+struct ProdGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c5b5398787b44e658b0f8390162df0e6c3006651
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_sum_op.h"
+
+REGISTER_REDUCE_OP(reduce_sum);
+REGISTER_OP_CPU_KERNEL(
+    reduce_sum, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                                  ops::SumFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
+                      ops::SumFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
+                      ops::SumFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             float, ops::SumGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             double, ops::SumGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int, ops::SumGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int64_t, ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f2e16955a50dc6a7feda9fbaf968c929ef3d8a4f
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_sum_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_sum,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::SumFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e67d7e1da5f0244d2dee346873692a80cbad2fc4
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct SumFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->sum(dim);
+  }
+};
+
+struct SumGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a20f7d231fa9ea313581ac0629a87fa5f4a88ce5
--- /dev/null
+++ b/paddle/fluid/operators/reverse_op.cc
@@ -0,0 +1,107 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reverse_op.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class ReverseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
+    const auto& x_dims = ctx->GetInputDim("X");
+    const auto& axis = ctx->Attrs().Get<std::vector<int>>("axis");
+    PADDLE_ENFORCE(!axis.empty(), "'axis' can not be empty.");
+    for (int a : axis) {
+      PADDLE_ENFORCE_LT(a, x_dims.size(),
+                        "The axis must be less than input tensor's rank.");
+    }
+    ctx->SetOutputDim("Out", x_dims);
+  }
+};
+
+class ReverseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The LoDTensor to be flipped.");
+    AddOutput("Out", "The LoDTensor after flipping.");
+    AddAttr<std::vector<int>>(
+        "axis", "The axises that along which order of elements is reversed.");
+    AddComment(R"DOC(
+      Reverse Operator.
+
+      Reverse the order of elements in the input LoDTensor along given axises.
+
+      Case 1:
+        Given
+            X = [[1, 2, 3, 4, 5]
+                 [6, 7, 8, 9, 10]
+                 [11, 12, 13, 14, 15]],
+        and
+            axis = [0],
+        we get:
+            Out = [[11, 12, 13, 14, 15]
+                   [6, 7, 8, 9, 10]
+                   [1, 2, 3, 4, 5]].
+        
+      Case 2:
+        Given
+            X = [[[1, 2, 3, 4]
+                  [5, 6, 7, 8]]
+                 [[9, 10, 11, 12]
+                  [13, 14, 15, 16]]],
+        and
+            axis = [0, 2],
+        we get:
+            Out = [[[12, 11, 10, 9]
+                    [16, 15, 14, 13]]
+                   [[4, 3, 2, 1]
+                    [8, 7, 6, 5]]],
+    )DOC");
+  }
+};
+
+class ReverseGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
+    grad_op->SetType("reverse");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("axis", GetAttr("axis"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(reverse, ops::ReverseOp, ops::ReverseOpMaker,
+                  ops::ReverseGradMaker);
+REGISTER_OPERATOR(reverse_grad, ops::ReverseOp);
+REGISTER_OP_CPU_KERNEL(
+    reverse, ops::ReverseKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, uint8_t>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, bool>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, double>)
diff --git a/paddle/fluid/operators/reverse_op.cu b/paddle/fluid/operators/reverse_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..635c41529b38f2dd287b00ed2e5659e11f619e78
--- /dev/null
+++ b/paddle/fluid/operators/reverse_op.cu
@@ -0,0 +1,24 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reverse_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    reverse, ops::ReverseKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, uint8_t>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, bool>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, double>)
diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9063cd59bba5c6307b55a500455908a5fd278390
--- /dev/null
+++ b/paddle/fluid/operators/reverse_op.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T, int Rank>
+struct ReverseFunctor {
+  void operator()(const DeviceContext& context, const framework::LoDTensor& in,
+                  framework::LoDTensor* out, const std::vector<int>& axis) {
+    Eigen::array<bool, Rank> reverse_axis;
+    for (int i = 0; i < Rank; ++i) {
+      reverse_axis[i] = false;
+    }
+    for (int a : axis) {
+      reverse_axis[a] = true;
+    }
+
+    auto in_eigen = framework::EigenTensor<T, Rank>::From(in);
+    auto out_eigen = framework::EigenTensor<T, Rank>::From(*out);
+    auto* dev = context.eigen_device();
+
+    out_eigen.device(*dev) = in_eigen.reverse(reverse_axis);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ReverseKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<framework::LoDTensor>("X");
+    auto* out = context.Output<framework::LoDTensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+    const auto& axis = context.Attr<std::vector<int>>("axis");
+    int rank = x->dims().size();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    switch (rank) {
+      case 1:
+        ReverseFunctor<DeviceContext, T, 1> functor1;
+        functor1(dev_ctx, *x, out, axis);
+        break;
+      case 2:
+        ReverseFunctor<DeviceContext, T, 2> functor2;
+        functor2(dev_ctx, *x, out, axis);
+        break;
+      case 3:
+        ReverseFunctor<DeviceContext, T, 3> functor3;
+        functor3(dev_ctx, *x, out, axis);
+        break;
+      case 4:
+        ReverseFunctor<DeviceContext, T, 4> functor4;
+        functor4(dev_ctx, *x, out, axis);
+        break;
+      case 5:
+        ReverseFunctor<DeviceContext, T, 5> functor5;
+        functor5(dev_ctx, *x, out, axis);
+        break;
+      case 6:
+        ReverseFunctor<DeviceContext, T, 6> functor6;
+        functor6(dev_ctx, *x, out, axis);
+        break;
+      default:
+        PADDLE_THROW(
+            "Reserve operator doesn't supports tensors whose ranks are greater "
+            "than 6.");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
index 2bc38ff4e3e6ee32bb2b0dbf4daa6d871dbaebfd..511ad753876dd26a4d6bc8c2c727c7c9253ce59c 100644
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -44,7 +44,8 @@ class SendBarrierOp : public framework::OperatorBase {
     // For profiling
     platform::RecordEvent record_event(Type(), &ctx);
 
-    auto rpc_client = detail::RPCClient::GetInstance();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
 
     VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
 
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index a91b1453896f951be58797071d9a5928633ccdcf..9697579707f1a510ba7db8a1a9616b59918b971b 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -49,12 +49,13 @@ class SendOp : public framework::OperatorBase {
     // For profiling
     platform::RecordEvent record_event(Type(), &ctx);
 
-    auto rpc_client = detail::RPCClient::GetInstance();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+        rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
       }
@@ -72,7 +73,7 @@ class SendOp : public framework::OperatorBase {
     if (outs.size() > 0) {
       for (size_t i = 0; i < outs.size(); i++) {
         VLOG(2) << "getting " << outs[i] << " from " << epmap[i];
-        rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+        rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
       }
       rpc_client->Wait();
       // tell pservers that current trainer have called fetch
diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc
index fe839dab6924618c8a4c39868d9bf86056a0be40..564e40461f8f894cffab11e26cc538b7964b6f19 100644
--- a/paddle/fluid/operators/send_vars_op.cc
+++ b/paddle/fluid/operators/send_vars_op.cc
@@ -45,14 +45,15 @@ class SendVarsOp : public framework::OperatorBase {
     // For profiling
     platform::RecordEvent record_event(Type(), &ctx);
 
-    auto rpc_client = detail::RPCClient::GetInstance();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
         // TODO(Yancey1989): we need to use an IO threadpool which has
         // a larger number of threads than the computing threadpool.
-        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+        rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
       }
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index cc256aa627bdda0609f496cab93a2dec7d95f348..c90a3be964a3a309a182d3620abec619c366dd84 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -49,6 +49,9 @@ class SoftmaxOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
     framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
     if (platform::CanCUDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kCUDNN;
@@ -58,6 +61,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
     if (library_ == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
 
@@ -68,9 +72,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
                      "float16 can only be used on GPU place");
     }
 
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-                                   framework::StringToDataLayout(data_format),
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
                                    library_);
   }
 };
@@ -142,6 +144,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
     framework::LibraryType library_{framework::LibraryType::kPlain};
+
 #ifdef PADDLE_WITH_CUDA
     if (platform::CanCUDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kCUDNN;
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index 855157e7c4c5c4a43091d28d3a5414e6e386b727..4b1208c4376b48e25866fc510f3a6d2ea06e7610 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -17,23 +17,93 @@
 #include "paddle/fluid/operators/tensorrt_engine_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 
 namespace paddle {
 namespace operators {
 
+using inference::Singleton;
+using inference::tensorrt::TRT_EngineManager;
+
+using FluidDT = framework::proto::VarType_Type;
+using TRT_DT = nvinfer1::DataType;
+
+namespace {
+
+TRT_DT FluidDataType2TRT(FluidDT type) {
+  switch (type) {
+    case FluidDT::VarType_Type_FP32:
+      return TRT_DT::kFLOAT;
+    case FluidDT::VarType_Type_INT32:
+      return TRT_DT::kINT32;
+    default:
+      return TRT_DT::kINT32;
+  }
+  PADDLE_THROW("unkown type");
+  return TRT_DT::kINT32;
+}
+
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
+  PADDLE_ENFORCE_GT(shape.size(), 1UL,
+                    "TensorRT' tensor input requires at least 2 dimensions");
+  PADDLE_ENFORCE_LE(shape.size(), 4UL,
+                    "TensorRT' tensor input requires at most 4 dimensions");
+
+  switch (shape.size()) {
+    case 2:
+      return nvinfer1::Dims2(shape[0], shape[1]);
+    case 3:
+      return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
+    case 4:
+      return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]);
+    default:
+      return nvinfer1::Dims();
+  }
+  return nvinfer1::Dims();
+}
+
+}  // namespace
+
 template <typename DeviceContext, typename T>
 void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
     const framework::ExecutionContext &context) const {
+  VLOG(4) << "Prepare engine";
   // Get the ProgramDesc and pass to convert.
-  const auto &block = context.Attr<framework::proto::BlockDesc>("subgraph");
+  framework::proto::BlockDesc block_desc;
+  block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
   max_batch_ = context.Attr<int>("max_batch");
   auto max_workspace = context.Attr<int>("max_workspace");
-  engine_.reset(new inference::tensorrt::TensorRTEngine(
-      max_batch_, max_workspace, nullptr));
+  engine_ = Singleton<TRT_EngineManager>::Global().Create(
+      max_batch_, max_workspace, &stream_);
+  engine_->InitNetwork();
+
+  framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
+  // Add inputs
+  VLOG(4) << "declare inputs";
+  for (auto &input : context.Inputs("Xs")) {
+    VLOG(4) << "declare input " << input;
+    auto *var = block.FindVar(input);
+    PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+                      "TensorRT engine only takes LoDTensor as input");
+    auto shape = var->GetShape();
+    engine_->DeclareInput(
+        input, FluidDataType2TRT(
+                   var->Proto()->type().lod_tensor().tensor().data_type()),
+        Vec2TRT_Dims(var->GetShape()));
+  }
+
   // TODO(Superjomn) parameters should be passed after analysised from outside.
   inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
-      block, {}, context.scope(), engine_.get());
+      block_desc, {}, context.scope(), engine_);
+
+  // Add outputs
+  VLOG(4) << "declare outputs";
+  for (auto &output : context.Outputs("Ys")) {
+    VLOG(4) << "declare output " << output;
+    engine_->DeclareOutput(output);
+  }
+
   engine_->FreezeNetwork();
 }
 
@@ -42,7 +112,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("Xs", "A list of inputs.").AsDuplicable();
     AddOutput("Ys", "A list of outputs").AsDuplicable();
-    AddAttr<std::string>("subgraph", "the subgraph");
+    AddAttr<std::string>("subgraph", "the subgraph.");
+    AddAttr<int>("max_batch", "the maximum batch size.");
+    AddAttr<int>("max_workspace", "the maximum batch size.");
     AddComment("TensorRT engine operator.");
   }
 };
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index fe273d386c529be3df05a955f492e2c39d4d8812..4b089601ff76eedd87bb3a52a38c4d22d4a94bf6 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -32,9 +32,12 @@ class TensorRTEngineOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
+    auto input0 = ctx.Inputs("Xs").front();
     framework::OpKernelType kt = framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input<framework::LoDTensor>("pre_ids")->type()),
+        framework::ToDataType(ctx.scope()
+                                  .FindVar(input0)
+                                  ->GetMutable<framework::LoDTensor>()
+                                  ->type()),
         platform::CPUPlace());
     return kt;
   }
@@ -50,17 +53,16 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     auto input_names = context.op().Inputs("Xs");
     PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
     // Try to determine a batch_size
-    auto* tensor0 = context.Input<framework::LoDTensor>(input_names.front());
-    PADDLE_ENFORCE_NOT_NULL(tensor0);
-    int batch_size = tensor0->dims()[0];
+    auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
+        context.scope(), input_names.front());
+    int batch_size = tensor0.dims()[0];
     PADDLE_ENFORCE_LE(batch_size, max_batch_);
 
     // Convert input tensor from fluid to engine.
     for (const auto& x : context.Inputs("Xs")) {
       // convert input and copy to TRT engine's buffer
-      auto* v = context.scope().FindVar(x);
-      PADDLE_ENFORCE_NOT_NULL(v, "no variable called %s", x);
-      auto& t = v->Get<framework::LoDTensor>();
+      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
+          context.scope(), x);
       if (platform::is_cpu_place(t.place())) {
         engine_->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
                                  t.memory_size());
@@ -86,13 +88,18 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       fluid_t->Resize(framework::make_ddim(ddim));
       auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
       if (platform::is_cpu_place(fluid_t->place())) {
+        // TODO(Superjomn) change this float to dtype size.
         engine_->GetOutputInCPU(
-            y, fluid_t->mutable_data<float>(platform::CPUPlace()), size);
+            y, fluid_t->mutable_data<float>(platform::CPUPlace()),
+            size * sizeof(float));
       } else {
         engine_->GetOutputInGPU(
-            y, fluid_t->mutable_data<float>(platform::CUDAPlace()), size);
+            y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
+            size * sizeof(float));
       }
     }
+
+    cudaStreamSynchronize(stream_);
   }
 
  protected:
@@ -100,7 +107,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
   void Prepare(const framework::ExecutionContext& context) const;
 
  private:
-  mutable std::unique_ptr<inference::tensorrt::TensorRTEngine> engine_;
+  mutable cudaStream_t stream_;
+  mutable inference::tensorrt::TensorRTEngine* engine_{nullptr};
   mutable int max_batch_{0};
 };
 
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6f383de259b270038c32296b59007f6c7d895f12
--- /dev/null
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -0,0 +1,152 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+USE_CPU_ONLY_OP(tensorrt_engine);
+
+namespace paddle {
+namespace operators {
+
+namespace {
+void CreateCPUTensor(framework::Scope* scope, const std::string& name,
+                     const std::vector<int64_t>& shape) {
+  auto* var = scope->Var(name);
+  auto* tensor = var->GetMutable<framework::LoDTensor>();
+  auto dims = framework::make_ddim(shape);
+  tensor->Resize(dims);
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  inference::tensorrt::RandomizeTensor(tensor, place, ctx);
+}
+
+void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
+                          const std::string& name,
+                          const std::vector<int64_t>& shape) {
+  using framework::proto::VarType;
+  auto* var = block->add_vars();
+  framework::VarDesc desc(name);
+  desc.SetType(VarType::LOD_TENSOR);
+  desc.SetDataType(VarType::FP32);
+  desc.SetShape(shape);
+  *var = *desc.Proto();
+}
+
+template <typename T>
+void SetAttr(framework::proto::OpDesc* op, const std::string& name,
+             const T& data);
+
+template <>
+void SetAttr<std::string>(framework::proto::OpDesc* op, const std::string& name,
+                          const std::string& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::STRING);
+  attr->set_s(data);
+}
+template <>
+void SetAttr<int>(framework::proto::OpDesc* op, const std::string& name,
+                  const int& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::INT);
+  attr->set_i(data);
+}
+template <>
+void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
+                      const int64_t& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::LONG);
+  attr->set_l(data);
+}
+
+}  // namespace
+
+TEST(TensorRTEngineOp, manual) {
+  framework::ProgramDesc program;
+  auto* block_ = program.Proto()->add_blocks();
+  block_->set_idx(0);
+  block_->set_parent_idx(-1);
+
+  LOG(INFO) << "create block desc";
+  framework::BlockDesc block_desc(&program, block_);
+  LOG(INFO) << "create mul op";
+  auto* mul = block_desc.AppendOp();
+  mul->SetType("mul");
+  mul->SetInput("X", std::vector<std::string>({"x"}));     // 2 x 4
+  mul->SetInput("Y", std::vector<std::string>({"y"}));     // 4 x 6
+  mul->SetOutput("Out", std::vector<std::string>({"z"}));  // 2 x 6
+
+  LOG(INFO) << "create fc op";
+  auto* fc = block_desc.AppendOp();
+  fc->SetType("mul");
+  fc->SetInput("X", std::vector<std::string>({"z"}));
+  fc->SetInput("Y", std::vector<std::string>({"y0"}));     // 6 x 8
+  fc->SetOutput("Out", std::vector<std::string>({"z0"}));  // 2 x 8
+
+  // Set inputs' variable shape in BlockDesc
+  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4}));
+  AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
+  AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
+  AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
+
+  // It is wired, need to copy manually.
+  *block_->add_ops() = *mul->Proto();
+  *block_->add_ops() = *fc->Proto();
+
+  ASSERT_EQ(block_->ops_size(), 2);
+
+  LOG(INFO) << "create tensorrt desc";
+  framework::OpDesc engine_op_desc(nullptr);
+  engine_op_desc.SetType("tensorrt_engine");
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x", "y", "y0"}));
+  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
+  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
+                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 30);
+  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
+
+  LOG(INFO) << "create engine op";
+  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  // Prepare variables.
+  CreateCPUTensor(&scope, "x", std::vector<int64_t>({2, 4}));
+  CreateCPUTensor(&scope, "y", std::vector<int64_t>({4, 6}));
+  CreateCPUTensor(&scope, "z", std::vector<int64_t>({2, 6}));
+
+  CreateCPUTensor(&scope, "y0", std::vector<int64_t>({6, 8}));
+  CreateCPUTensor(&scope, "z0", std::vector<int64_t>({2, 8}));
+
+  // Execute them.
+  LOG(INFO) << "engine_op run";
+  engine_op->Run(scope, place);
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+USE_TRT_CONVERTER(mul)
+USE_TRT_CONVERTER(fc)
diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc
index eb01ac9b9072b1bbd4115d60a2101d2f1cbcf93a..a01a75cbb070f7f835b93f5ddc62f5aad5a5e667 100644
--- a/paddle/fluid/operators/test_send_nccl_id.cc
+++ b/paddle/fluid/operators/test_send_nccl_id.cc
@@ -87,9 +87,10 @@ TEST(SendNcclId, GrpcServer) {
   int port = g_rpc_service->GetSelectedPort();
 
   std::string ep = string::Sprintf("127.0.0.1:%d", port);
-  detail::RPCClient* client = detail::RPCClient::GetInstance();
-  LOG(INFO) << "connect to server " << ep;
-  client->AsyncSendVariable(ep, dev_ctx, scope, NCCL_ID_VARNAME);
+  detail::RPCClient* client =
+      detail::RPCClient::GetInstance<detail::GRPCClient>();
+  LOG(INFO) << "connect to server" << ep;
+  client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME);
   client->Wait();
   client->AsyncSendBatchBarrier(ep);
   client->Wait();
diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h
index 123d3598f4f4753f70889e415aff0f41b7d212f7..2ce9b31bb81de867ff4ed6ee14afddecd95317b9 100644
--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
 
-#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG)
+#if defined(__CUDA_ARCH__)
 #include <stdio.h>
 #define PADDLE_ASSERT(e)                                           \
   do {                                                             \
@@ -38,6 +38,9 @@ limitations under the License. */
   } while (0)
 #else
 #include <assert.h>
-#define PADDLE_ASSERT(e) assert(e)
+// For cuda, the assertions can affect performance and it is therefore
+// recommended to disable them in production code
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
+#define PADDLE_ASSERT(e) assert((e))
 #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
 #endif
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 0f4a7c8485b21e36dac46c5a87c2445275a3195e..6ea4f8b7cba18ce7f803dbd9b15a7ae70c3055f2 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -81,6 +81,27 @@ enum class PoolingMode {
   kMaximumDeterministic,
 };
 
+#if CUDNN_VERSION < 6000
+#pragma message "CUDNN version under 6.0 is supported at best effort."
+#pragma message "We strongly encourage you to move to 6.0 and above."
+#pragma message "This message is intended to annoy you enough to update."
+#pragma message \
+    "please see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/"
+
+inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
+  switch (mode) {
+    case PoolingMode::kMaximumDeterministic:
+      return CUDNN_POOLING_MAX;
+    case PoolingMode::kAverage:
+      return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    case PoolingMode::kMaximum:
+      return CUDNN_POOLING_MAX;
+    default:
+      PADDLE_THROW("Unexpected pooling mode.");
+  }
+}
+#else
+
 inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
   switch (mode) {
     case PoolingMode::kMaximumDeterministic:
@@ -93,6 +114,7 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
       PADDLE_THROW("Unexpected pooling mode.");
   }
 }
+#endif  // CUDNN_VERSION < 6000
 
 template <typename T>
 class CudnnDataType;
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index f1187620d81ff3bc1deef2106edb54d6199fa927..de711b7d23ef01d57a62087c552ea090f01f0386 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <mkldnn.h>
 #include <vector>
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace platform {
@@ -86,5 +87,17 @@ inline mkldnn::memory::data_type MKLDNNGetDataType<float>() {
   return mkldnn::memory::f32;
 }
 
+inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) {
+  auto reorder_prim = mkldnn::reorder(src, dst);
+  std::vector<mkldnn::primitive> pipeline;
+  pipeline.push_back(reorder_prim);
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+}
+
+inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) {
+  return static_cast<mkldnn::memory::format>(
+      memory.get_primitive_desc().desc().data.format);
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 03cf417b62f96fd6812b3eac497ffdf9a484f5eb..669d1bdaa3ec194be817cdc5e1f8484770c70c68 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -553,6 +553,12 @@ All parameter, weight, gradient are variables in Paddle.
           [](BuildStrategy &self,
              BuildStrategy::GradientScaleStrategy strategy) {
             self.gradient_scale_ = strategy;
+          })
+      .def_property(
+          "debug_graphviz_path",
+          [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
+          [](BuildStrategy &self, const std::string &path) {
+            self.debug_graphviz_path_ = path;
           });
 
   pe.def(py::init<const std::vector<platform::Place> &,
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 113d02ce4865877d9385da31d996c0985c348716..55959197e7cd82253fb0c604604b4302ca0a3dc7 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -447,7 +447,7 @@ EOF
     # run paddle version to install python packages first
     RUN apt-get update &&\
         ${NCCL_DEPS}\
-        apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \
+        apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
         pip install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
         rm -f /*.whl && \
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 93aa5f908ec929a33089a62caa2186ba9e57fffe..33d8f709412b25d29c6618272500dd7b953d6645 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -170,6 +170,8 @@ def get_program_cache_key(feed, fetch_list):
             return var.desc.name()
         elif isinstance(var, str):
             return var
+        elif isinstance(var, basestring):
+            return str(var)
         else:
             raise TypeError(str(var) + " should be Variable or str")
 
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 33b5caa0eab0ec192eb4a3b63cf82a672c58d2cb..bbd35aaecba27ea9fd66b9be585a972690980ab8 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -72,6 +72,8 @@ def convert_np_dtype_to_dtype_(np_dtype):
         return core.VarDesc.VarType.INT64
     elif dtype == np.bool:
         return core.VarDesc.VarType.BOOL
+    elif dtype == np.uint16:
+        return core.VarDesc.VarType.INT16
     elif dtype == np.uint8:
         return core.VarDesc.VarType.UINT8
     else:
@@ -361,6 +363,13 @@ class OpProtoHolder(object):
             raise ValueError("Operator \"%s\" has not been registered." % type)
         return self.op_proto_map[type]
 
+    @staticmethod
+    def generated_op_attr_names():
+        return {
+            core.op_proto_and_checker_maker.kOpRoleAttrName(),
+            core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+        }
+
 
 class Operator(object):
     """
@@ -368,6 +377,13 @@ class Operator(object):
     Block. Users can use the build in instructions to describe their neural
     network.
     """
+    OP_WITHOUT_KERNEL_SET = {
+        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
+        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
+        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
+        'ncclInit', 'channel_create', 'channel_close', 'channel_send',
+        'channel_recv', 'select'
+    }
 
     def __init__(self,
                  block,
@@ -504,17 +520,13 @@ class Operator(object):
                 else:
                     self.desc.set_attr(attr_name, self.attrs[attr_name])
         self.desc.check_attrs()
-        no_kernel_op_set = {
-            'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
-            'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
-            'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
-            'load_combine', 'ncclInit', 'channel_create', 'channel_close',
-            'channel_send', 'channel_recv', 'select', 'gen_nccl_id'
-        }
-        if type not in no_kernel_op_set:
+        if self.has_kernel(type):
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
 
+    def has_kernel(self, op_type):
+        return op_type not in self.OP_WITHOUT_KERNEL_SET
+
     def to_string(self, throw_on_error):
         """
         To debug string.
@@ -742,7 +754,9 @@ class Block(object):
 
     def var(self, name):
         if not isinstance(name, basestring):
-            raise TypeError()
+            raise TypeError(
+                "var require string as parameter, but get %s instead." %
+                (type(name)))
         v = self.vars.get(name, None)
         if v is None:
             raise ValueError("var %s not in this block" % name)
diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py
index 9f242cf29a56573349f192307a68e135a409a4be..6baac00905713594acd59bb3819038576fab0674 100644
--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
@@ -56,6 +56,8 @@ class Inferencer(object):
         else:
             self.exe = executor.Executor(self.place)
 
+        self.inference_program = self.inference_program.clone(for_test=True)
+
     def infer(self, inputs, return_numpy=True):
         """
         :param inputs: a map of {"input_name": input_var} that will be feed into the inference program
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 8758ac9f94ab91b5be5fc70917c64db38997d1c1..a56f3ea9db6b9fabf9d78f102d394a0817a44a98 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -434,7 +434,7 @@ def open_files(filenames,
                shapes,
                lod_levels,
                dtypes,
-               thread_num,
+               thread_num=1,
                buffer_size=None,
                pass_num=1,
                for_parallel=True):
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 295d1b7190ec39bcc6efdf72aebede14a99807aa..904413cc11b50f80d3c4730bf66ec359f9285ae6 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -15,16 +15,13 @@ import re
 import cStringIO
 import functools
 import warnings
+import string
 
 from ..proto import framework_pb2
 from ..framework import OpProtoHolder, Variable
 from ..layer_helper import LayerHelper
 
-__all__ = [
-    'deprecated',
-    'generate_layer_fn',
-    'autodoc',
-]
+__all__ = ['deprecated', 'generate_layer_fn', 'autodoc', 'templatedoc']
 
 
 def _convert_(name):
@@ -43,6 +40,10 @@ def _convert_(name):
     return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
 
 
+def _type_to_str_(tp):
+    return framework_pb2.AttrType.Name(tp)
+
+
 def _generate_doc_string_(op_proto):
     """
     Generate docstring by OpProto
@@ -54,9 +55,6 @@ def _generate_doc_string_(op_proto):
         str: the document string
     """
 
-    def _type_to_str_(tp):
-        return framework_pb2.AttrType.Name(tp)
-
     if not isinstance(op_proto, framework_pb2.OpProto):
         raise TypeError("OpProto should be `framework_pb2.OpProto`")
 
@@ -75,7 +73,11 @@ def _generate_doc_string_(op_proto):
         buf.write(str(each_input.dispensable))
         buf.write('\n')
 
+    skip_attrs = OpProtoHolder.generated_op_attr_names()
+
     for each_attr in op_proto.attrs:
+        if each_attr.name in skip_attrs:
+            continue
         buf.write('    ')
         buf.write(each_attr.name)
         buf.write(' (')
@@ -220,3 +222,49 @@ def autodoc(comment=""):
         return func
 
     return __impl__
+
+
+def templatedoc():
+    """
+    Decorator of layer function. It will use the docstring from the layer
+    function as the template. The template arguments are:
+
+    * ${comment}: The operator comment written in CPP.
+    * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput,
+        and AddInput. The ${name} is Python snake style. i.e., xxx_xxx.
+    * ${{name}_type}: The type of ${name}.
+
+    Returns:
+        Decorated function.
+    """
+
+    def __impl__(func):
+        op_proto = OpProtoHolder.instance().get_op_proto(func.__name__)
+        tmpl = string.Template(func.__doc__)
+
+        comment_lines = op_proto.comment.split("\n")
+        comment = ""
+        for line in comment_lines:
+            line = line.lstrip()
+            comment += line
+            comment += "\n"
+
+        args = {"comment": comment}
+        for each_input in op_proto.inputs:
+            input_name = _convert_(each_input.name)
+            args["{0}_comment".format(input_name)] = each_input.comment
+            args["{0}_type".format(input_name)] = "Variable"
+        for each_attr in op_proto.attrs:
+            input_name = _convert_(each_attr.name)
+            args["{0}_comment".format(input_name)] = each_attr.comment
+            args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type)
+
+        for each_opt in op_proto.outputs:
+            output_name = _convert_(each_opt.name)
+            args["{0}_comment".format(output_name)] = each_opt.comment
+            args["{0}_type".format(output_name)] = "Variable"
+
+        func.__doc__ = tmpl.substitute(args)
+        return func
+
+    return __impl__
diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py
index cab2eb55510542bdd4dd7eca7667601697759181..a1c64ce2771526cbd0baa944f97d01e7878b3ac1 100644
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
@@ -64,10 +64,6 @@ def auc(input, label, curve='ROC', num_thresholds=200):
     topk_indices = helper.create_tmp_variable(dtype="int64")
     topk_out, topk_indices = nn.topk(input, k=k)
     auc_out = helper.create_tmp_variable(dtype="float32")
-    if correct is None:
-        correct = helper.create_tmp_variable(dtype="int64")
-    if total is None:
-        total = helper.create_tmp_variable(dtype="int64")
     helper.append_op(
         type="accuracy",
         inputs={
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 56f6f26803919a171f6459c909e6bb71ab63b180..ddaeb415af4320c233aa7d01130fe1da2cdcbfa8 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -19,9 +19,10 @@ from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from tensor import concat
 import utils
+import random
 
 __all__ = [
     'fc',
@@ -801,7 +802,22 @@ def gru_unit(input,
     return updated_hidden, reset_hidden_pre, gate
 
 
+@templatedoc()
 def linear_chain_crf(input, label, param_attr=None):
+    """
+    Linear Chain CRF.
+
+    ${comment}
+
+    Args:
+        input(${emission_type}): ${emission_comment}
+        label(${label_type}): ${label_comment}
+        param_attr(ParamAttr): The attribute of the learnable parameter.
+
+    Returns:
+        ${log_likelihood_comment}
+
+    """
     helper = LayerHelper('linear_chain_crf', **locals())
     size = input.shape[1]
     transition = helper.create_parameter(
@@ -827,7 +843,19 @@ def linear_chain_crf(input, label, param_attr=None):
     return log_likelihood
 
 
+@templatedoc()
 def crf_decoding(input, param_attr, label=None):
+    """
+    ${comment}
+
+    Args:
+        input(${emission_type}): ${emission_comment}
+        param_attr(ParamAttr): The parameter attribute for training.
+        label(${label_type}): ${label_comment}
+
+    Returns:
+        ${viterbi_path_comment}
+    """
     helper = LayerHelper('crf_decoding', **locals())
     transition = helper.get_parameter(param_attr.name)
     viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -1182,19 +1210,19 @@ def conv2d(input,
 
         - Input:
 
-          Input shape: $(N, C_{in}, H_{in}, W_{in})$
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
 
-          Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
 
         - Output:
-          Output shape: $(N, C_{out}, H_{out}, W_{out})$
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
 
         Where
 
         .. math::
 
-        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-        W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
 
     Args:
        input(Variable): The input image with [N, C, H, W] format.
@@ -4107,10 +4135,31 @@ def gather(input, index):
     return out
 
 
-def random_crop(input, shape, seed=1):
+@templatedoc()
+def random_crop(x, shape, seed=None):
+    """
+    ${comment}
+
+    Examples:
+        >>> img = fluid.layers.data("img", [3, 256, 256])
+        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
+
+    Args:
+        x(${x_type}): ${x_comment}
+        shape(${shape_type}): ${shape_comment}
+        seed(int|${seed_type}|None): ${seed_comment} By default, the seed will
+            get from `random.randint(-65536, 65535)`.
+
+    Returns:
+        ${out_comment}
+
+    """
     helper = LayerHelper("random_crop", **locals())
     dtype = helper.input_dtype()
     out = helper.create_tmp_variable(dtype)
+    if seed is None:
+        seed = random.randint(-65536, 65535)
+
     if isinstance(seed, int):
         seed_value = seed
         seed = helper.create_tmp_variable(dtype="int64")
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index be34cc81a5d5ca0e781e5984b6c3eeaa4e25eb90..75d3bf879703a1db1108eae45d879164e0024156 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -363,6 +363,40 @@ def zeros(shape, dtype, force_cpu=False):
     return fill_constant(value=0.0, **locals())
 
 
+def reverse(x, axis):
+    """
+    **reverse**
+
+    This function reverse the input 'x' along given axises.
+
+    Args:
+        x(Vairbale): the input to be reversed.
+        axis(int|tuple|list): Axis that along which order of elements 
+                    is reversed. If it is a tuple or a list, reversing 
+                    will be apply on each axis in the tuple or list.  
+
+    Returns:
+        Variable: The reversed tensor.
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.reverse(x=in, axis=0)
+          # or:
+          out = fluid.layers.reverse(x=in, axis=[0,1])
+    """
+    if isinstance(axis, int):
+        axis = [axis]
+    helper = LayerHelper("reverse", **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='reverse',
+        inputs={'Input': x},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
 def save(x, file_path, overwrite=True):
     """
     Saves a variable as a file.
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
index b3117cf2e5e0513089e5e1146d49702fcc8b7ba6..ad28c9eff560507e5b326451159be3949353f58f 100644
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -38,7 +38,7 @@ def inference_program():
     return y_predict
 
 
-def linear():
+def train_program():
     y = fluid.layers.data(name='y', shape=[1], dtype='float32')
     y_predict = inference_program()
 
@@ -104,7 +104,7 @@ def main(use_cuda):
     # Directory for saving the trained model
     params_dirname = "fit_a_line.inference.model"
 
-    train(use_cuda, linear, params_dirname)
+    train(use_cuda, train_program, params_dirname)
     infer(use_cuda, inference_program, params_dirname)
 
 
diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..e891ee932f1440001eb25b222f1f4613e97dfcb1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/benchmark.py
@@ -0,0 +1,113 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+import time
+import itertools
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from op_test import OpTest
+
+
+class BenchmarkSuite(OpTest):
+    def timeit_function(self, callback, iters, *args, **kwargs):
+        assert iters != 0, "Iters should >= 1"
+        start = time.time()
+        for i in range(iters):
+            callback(*args, **kwargs)
+        elapse = time.time() - start
+        return elapse / iters
+
+    def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol):
+        for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs,
+                                                        fetch_list):
+            # the cpu version is baseline, expect gpu version keep same with cpu version.
+            expect = item_cpu_out
+            expect_t = np.array(item_cpu_out)
+            actual = item_gpu_out
+            actual_t = np.array(item_gpu_out)
+            var_name = variable if isinstance(variable,
+                                              basestring) else variable.name
+            self.assertTrue(
+                np.allclose(
+                    actual_t, expect_t, atol=atol),
+                "Output (" + var_name + ") has diff" + str(actual_t) + "\n" +
+                str(expect_t))
+            self.assertListEqual(actual.lod(),
+                                 expect.lod(),
+                                 "Output (" + var_name + ") has different lod")
+
+    def _get_input_names(self):
+        inputs = []
+        for name, value in self.inputs.iteritems():
+            if isinstance(value, list):
+                inputs.extend([sub_name for sub_name, _ in value])
+            inputs.append(name)
+        return inputs
+
+    def _get_output_names(self):
+        outputs = []
+        for var_name, var in self.outputs.iteritems():
+            if isinstance(var, list):
+                for sub_var_name, sub_var in var:
+                    outputs.append(sub_var_name)
+            else:
+                outputs.append(var_name)
+        if len(outputs) == 0:
+            for out_name, out_dup in Operator.get_op_outputs(self.op_type):
+                outputs.append(str(out_name))
+        return outputs
+
+    def check_output_stability(self, atol=1e-8):
+        places = self._get_places()
+        if len(places) < 2:
+            return
+        cpu_outs, fetch_list = self._calc_output(places[0])
+        gpu_outs, _ = self._calc_output(places[1])
+        self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol)
+
+    def timeit_output_with_place(self, place, iters):
+        return self.timeit_function(self.calc_output, iters, place)
+
+    def timeit_output(self, iters=100):
+        places = self._get_places()
+        elapses = []
+        for place in places:
+            elapses.append(self.timeit_output_with_place(place, iters))
+        for place, elapse in zip(places, elapses):
+            print("One pass of ({2}_op) at {0} cost {1}".format(
+                str(place), elapse, self.op_type))
+
+    def timeit_grad_with_place(self, place, iters=100):
+        inputs_to_check = self._get_input_names()
+        output_names = self._get_output_names()
+        return self.timeit_function(
+            self._get_gradient,
+            iters,
+            inputs_to_check,
+            place,
+            output_names,
+            no_grad_set=None)
+
+    def timeit_grad(self, iters=100):
+        places = self._get_places()
+        elapses = []
+        for place in places:
+            elapses.append(self.timeit_grad_with_place(place, iters))
+        for place, elapse in zip(places, elapses):
+            print("One pass of ({2}_grad_op) at {0} cost {1}".format(
+                str(place), elapse, self.op_type))
diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..91a5f1bca4441d80489a02eb9283928e38321826
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
@@ -0,0 +1,82 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+from benchmark import BenchmarkSuite
+from op_test import OpTest
+
+# This is a demo op test case for operator benchmarking and high resolution number stability alignment.
+
+
+class TestSumOp(BenchmarkSuite):
+    def setUp(self):
+        self.op_type = "sum"
+        self.customize_testcase()
+        self.customize_fetch_list()
+
+    def customize_fetch_list(self):
+        """
+        customize fetch list, configure the wanted variables.
+        >>> self.fetch_list = ["Out"]
+        """
+        self.fetch_list = ["Out"]
+        # pass
+
+    def customize_testcase(self):
+        # a test case
+        x0 = np.random.random((300, 400)).astype('float32')
+        x1 = np.random.random((300, 400)).astype('float32')
+        x2 = np.random.random((300, 400)).astype('float32')
+
+        # NOTE: if the output is empty, then it will autofilled by benchmarkSuite.
+        # only the output dtype is used, the shape, lod and data is computed from input.
+        self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
+        self.outputs = {"Out": x0 + x1 + x2}
+
+    def test_check_output(self):
+        """
+        compare the output with customized output. In this case,
+        you should set the correct output by hands.
+        >>> self.outputs = {"Out": x0 + x1 + x2}
+        """
+        self.check_output(atol=1e-8)
+
+    def test_output_stability(self):
+        # compare the cpu gpu output in high resolution.
+        self.check_output_stability()
+
+    def test_timeit_output(self):
+        """
+        perf the op, time cost will be averged in iters.
+        output example
+        >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
+        >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
+        """
+        self.timeit_output(iters=100)
+
+    def test_timeit_grad(self):
+        """
+        perf the op gradient, time cost will be averged in iters.
+        output example
+        >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
+        >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
+        """
+        self.timeit_grad(iters=100)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index b611470fa1ff326df960c349b71006f52d586d8e..307caae4b0cf4869c1abb755215aa97795d47e15 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -15,13 +15,17 @@
 import unittest
 import numpy as np
 import random
+import time
 import itertools
-import paddle.fluid.core as core
 import collections
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.fluid.backward import append_backward
 from paddle.fluid.op import Operator
 from paddle.fluid.executor import Executor
-from paddle.fluid.framework import Program, OpProtoHolder
+from paddle.fluid.framework import Program, OpProtoHolder, Variable
+from testsuite import create_op, set_input, append_input_output, append_loss_ops
 
 
 def randomize_probability(batch_size, class_num, dtype='float32'):
@@ -33,73 +37,6 @@ def randomize_probability(batch_size, class_num, dtype='float32'):
     return prob
 
 
-def create_op(scope, op_type, inputs, outputs, attrs):
-    kwargs = dict()
-
-    op_maker = core.op_proto_and_checker_maker
-    op_role_attr_name = op_maker.kOpRoleAttrName()
-
-    if op_role_attr_name not in attrs:
-        attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)
-
-    def __create_var__(name, var_name):
-        scope.var(var_name).get_tensor()
-        kwargs[name].append(var_name)
-
-    for in_name, in_dup in Operator.get_op_inputs(op_type):
-        if in_name in inputs:
-            kwargs[in_name] = []
-            if in_dup:
-                sub_in = inputs[in_name]
-                for item in sub_in:
-                    sub_in_name, _ = item[0], item[1]
-                    __create_var__(in_name, sub_in_name)
-            else:
-                __create_var__(in_name, in_name)
-
-    for out_name, out_dup in Operator.get_op_outputs(op_type):
-        if out_name in outputs:
-            kwargs[out_name] = []
-            if out_dup:
-                sub_out = outputs[out_name]
-                for item in sub_out:
-                    sub_out_name, _ = item[0], item[1]
-                    __create_var__(out_name, sub_out_name)
-            else:
-                __create_var__(out_name, out_name)
-
-    for attr_name in Operator.get_op_attr_names(op_type):
-        if attr_name in attrs:
-            kwargs[attr_name] = attrs[attr_name]
-
-    return Operator(op_type, **kwargs)
-
-
-def set_input(scope, op, inputs, place):
-    def __set_input__(var_name, var):
-        if isinstance(var, tuple) or isinstance(var, np.ndarray):
-            tensor = scope.find_var(var_name).get_tensor()
-            if isinstance(var, tuple):
-                tensor.set_lod(var[1])
-                var = var[0]
-            tensor.set_dims(var.shape)
-            tensor.set(var, place)
-        elif isinstance(var, float):
-            scope.find_var(var_name).set_float(var)
-        elif isinstance(var, int):
-            scope.find_var(var_name).set_int(var)
-
-    for in_name, in_dup in Operator.get_op_inputs(op.type()):
-        if in_name in inputs:
-            if in_dup:
-                sub_in = inputs[in_name]
-                for item in sub_in:
-                    sub_in_name, sub_in_val = item[0], item[1]
-                    __set_input__(sub_in_name, sub_in_val)
-            else:
-                __set_input__(in_name, inputs[in_name])
-
-
 def get_numeric_gradient(place,
                          scope,
                          op,
@@ -173,54 +110,15 @@ def get_numeric_gradient(place,
     return gradient_flat.reshape(tensor_to_check.get_dims())
 
 
-def append_input_output(block, op_proto, np_list, is_input):
-    '''Insert VarDesc and generate Python variable instance'''
-    proto_list = op_proto.inputs if is_input else op_proto.outputs
-
-    def create_var(block, name, np_list, var_proto):
-        if name not in np_list:
-            assert var_proto.intermediate, "{} not found".format(name)
-            shape = None
-            lod_level = None
-        else:
-            np_value = np_list[name]
-            if isinstance(np_value, tuple):
-                shape = list(np_value[0].shape)
-                lod_level = len(np_value[1])
-            else:
-                shape = list(np_value.shape)
-                lod_level = 0
-        return block.create_var(
-            dtype="float32", shape=shape, lod_level=lod_level, name=name)
-
-    var_dict = {}
-    for var_proto in proto_list:
-        var_name = str(var_proto.name)
-        if is_input:
-            if (var_name not in np_list) and var_proto.dispensable:
-                continue
-            assert (var_name in np_list) or (var_proto.dispensable), \
-                "Missing {} as input".format(var_name)
-        if var_proto.duplicable:
-            assert isinstance(np_list[var_name], list), \
-                "Duplicable {} should be set as list".format(var_name)
-            var_list = []
-            for (name, np_value) in np_list[var_name]:
-                var_list.append(
-                    create_var(block, name, {name: np_value}, var_proto))
-            var_dict[var_name] = var_list
-        else:
-            var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
-
-    return var_dict
-
-
 class OpTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         '''Fix random seeds to remove randomness from tests'''
         cls._np_rand_state = np.random.get_state()
         cls._py_rand_state = random.getstate()
+        cls.call_once = False
+        cls.dtype = "float32"
+        cls.outputs = {}
 
         np.random.seed(123)
         random.seed(124)
@@ -231,6 +129,31 @@ class OpTest(unittest.TestCase):
         np.random.set_state(cls._np_rand_state)
         random.setstate(cls._py_rand_state)
 
+    def try_call_once(self, data_type):
+        if not self.call_once:
+            self.call_once = True
+            self.dtype = data_type
+
+    def infer_dtype_from_inputs_outputs(self, inputs, outputs):
+        def infer_dtype(numpy_dict):
+            assert isinstance(
+                numpy_dict,
+                dict), "self.inputs, self.outputs must be numpy_dict"
+            for var_name, var_value in numpy_dict.iteritems():
+                if isinstance(var_value, (np.ndarray, np.generic)):
+                    self.try_call_once(var_value.dtype)
+                elif isinstance(var_value, (list, tuple)):
+                    # the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
+                    if len(var_value) > 1 and isinstance(var_value[1], (
+                            np.ndarray, np.generic)):
+                        instance = var_value[1]
+                        self.try_call_once(instance[1].dtype)
+                else:
+                    self.try_call_once("float32")
+
+        infer_dtype(inputs)
+        infer_dtype(outputs)
+
     def feed_var(self, input_vars, place):
         feed_map = {}
         for var_name in input_vars:
@@ -254,18 +177,14 @@ class OpTest(unittest.TestCase):
 
         return feed_map
 
-    def calc_output(self, place):
-        outs, _ = self._calc_output(place)
-        return outs
-
-    def _calc_output(self, place):
+    def _append_ops(self, block):
         op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
-
-        program = Program()
-        block = program.global_block()
-
-        inputs = append_input_output(block, op_proto, self.inputs, True)
-        outputs = append_input_output(block, op_proto, self.outputs, False)
+        "infer datatype from inputs and outputs for this test case"
+        self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
+        inputs = append_input_output(block, op_proto, self.inputs, True,
+                                     self.dtype)
+        outputs = append_input_output(block, op_proto, self.outputs, False,
+                                      self.dtype)
         op = block.append_op(
             type=self.op_type,
             inputs=inputs,
@@ -275,22 +194,68 @@ class OpTest(unittest.TestCase):
         op.desc.infer_var_type(block.desc)
         op.desc.infer_shape(block.desc)
 
-        fetch_list = []
-        for var_name, var in outputs.iteritems():
-            if var_name in self.outputs:
+    def _get_io_vars(self, block, numpy_inputs):
+        inputs = {}
+        for name, value in numpy_inputs.iteritems():
+            if isinstance(value, list):
+                var_list = [
+                    block.var(sub_name) for sub_name, sub_value in value
+                ]
+                inputs[name] = var_list
+            else:
+                inputs[name] = block.var(name)
+        return inputs
+
+    def _get_inputs(self, block):
+        return self._get_io_vars(block, self.inputs)
+
+    def _get_outputs(self, block):
+        return self._get_io_vars(block, self.outputs)
+
+    def calc_output(self, place):
+        outs, _ = self._calc_output(place)
+        return outs
+
+    def _calc_output(self, place, parallel=False):
+
+        program = Program()
+        block = program.global_block()
+        self._append_ops(block)
+
+        inputs = self._get_inputs(block)
+        outputs = self._get_outputs(block)
+        feed_map = self.feed_var(inputs, place)
+
+        if parallel:
+            use_cuda = False
+            if isinstance(place, fluid.CUDAPlace(0)):
+                use_cuda = True
+            executor = fluid.ParallelExecutor(
+                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
+        else:
+            executor = Executor(place)
+
+        fetch_list = getattr(self, "fetch_list", [])
+        # if the fetch_list is customized by user, we use it directly.
+        # if not, fill the fetch_list by the user configured outputs in test.
+        if len(fetch_list) == 0:
+            for var_name, var in outputs.iteritems():
                 if isinstance(var, list):
                     for v in var:
                         fetch_list.append(v)
                 else:
                     fetch_list.append(var)
-
-        feed_map = self.feed_var(inputs, place)
-
-        exe = Executor(place)
-        outs = exe.run(program,
-                       feed=feed_map,
-                       fetch_list=fetch_list,
-                       return_numpy=False)
+        # if the fetch_list still empty, fill the fetch_list by the operator output.
+        if len(fetch_list) == 0:
+            for out_name, out_dup in Operator.get_op_outputs(self.op_type):
+                fetch_list.append(str(out_name))
+        # fetch_list = map(block.var, fetch_list)
+        if not isinstance(fetch_list[0], Variable):
+            fetch_list = map(block.var, fetch_list)
+        outs = executor.run(program,
+                            feed=feed_map,
+                            fetch_list=fetch_list,
+                            return_numpy=False)
         return outs, fetch_list
 
     def check_output_with_place(self, place, atol):
@@ -346,17 +311,19 @@ class OpTest(unittest.TestCase):
                                          "Output (" + out_name +
                                          ") has different lod at " + str(place))
 
-    def check_output(self, atol=1e-5):
-        places = [core.CPUPlace()]
+    def _get_places(self):
+        places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
             places.append(core.CUDAPlace(0))
+        return places
+
+    def check_output(self, atol=1e-5):
+        places = self._get_places()
         for place in places:
             self.check_output_with_place(place, atol)
 
     def check_output_customized(self, checker):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
-            places.append(core.CUDAPlace(0))
+        places = self._get_places()
         for place in places:
             outs = self.calc_output(place)
             outs = [np.array(out) for out in outs]
@@ -389,9 +356,7 @@ class OpTest(unittest.TestCase):
                    in_place=False,
                    max_relative_error=0.005,
                    user_defined_grads=None):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
-            places.append(core.CUDAPlace(0))
+        places = self._get_places()
         for place in places:
             self.check_grad_with_place(place, inputs_to_check, output_names,
                                        no_grad_set, numeric_grad_delta,
@@ -438,35 +403,6 @@ class OpTest(unittest.TestCase):
                                max_relative_error,
                                "Gradient Check On %s" % str(place))
 
-    @staticmethod
-    def _create_var_descs_(block, var_dict):
-        # FIXME: Try unify with `append_input_output`
-        for param_name in var_dict:
-            var = var_dict[param_name]
-            if not isinstance(var, list) and not isinstance(var, tuple):
-                var = [(param_name, var, None)]
-            if not isinstance(var[0], list) and not isinstance(var[0], tuple):
-                var = [(param_name, var[0], var[1])]
-
-            for i, item in enumerate(var):
-                if not isinstance(item[0], basestring):
-                    item = [[param_name] + list(item)]
-                if len(item) == 2:
-                    if isinstance(item[1], tuple):
-                        var[i] = [item[0], item[1][0], item[1][1]]
-                    else:
-                        # only set var name and value, set lod to None
-                        var[i] = list(item) + [None]
-            var_descs = [(block.create_var(
-                name=name, shape=each.shape, dtype=each.dtype), each, lod)
-                         for name, each, lod in var]
-
-            yield param_name, var_descs
-
-    @staticmethod
-    def _merge_list(iterable):
-        return reduce(lambda a, b: list(a) + list(b), iterable, [])
-
     @staticmethod
     def _numpy_to_lod_tensor(np_value, lod, place):
         tensor = core.LoDTensor()
@@ -497,83 +433,31 @@ class OpTest(unittest.TestCase):
             input.dtype = np.uint16
         return input
 
-    def _get_gradient(self, input_to_check, place, output_names, no_grad_set):
+    def _get_gradient(self,
+                      input_to_check,
+                      place,
+                      output_names,
+                      no_grad_set,
+                      parallel=False):
         prog = Program()
         block = prog.global_block()
-        inputs_with_np = {
-            key: value
-            for (key, value) in OpTest._create_var_descs_(
-                block, getattr(self, 'inputs', {}))
-        }
-        outputs_with_np = {
-            key: val
-            for (key, val) in OpTest._create_var_descs_(
-                block, getattr(self, 'outputs', {}))
-        }
-        inputs = {
-            k: [item[0] for item in inputs_with_np[k]]
-            for k in inputs_with_np
-        }
-        outputs = {
-            k: [item[0] for item in outputs_with_np[k]]
-            for k in outputs_with_np
-        }
-
-        op = block.append_op(
-            type=self.op_type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=getattr(self, 'attrs', {}))
-
-        # infer variable type and infer shape in compile-time
-        op.desc.infer_var_type(block.desc)
-        op.desc.infer_shape(block.desc)
-
-        mean_inputs = map(block.var, output_names)
-
-        if len(mean_inputs) == 1:
-            loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
-            op = block.append_op(
-                inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
-            op.desc.infer_var_type(block.desc)
-            op.desc.infer_shape(block.desc)
-        else:
-            avg_sum = []
-            for cur_loss in mean_inputs:
-                cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
-                op = block.append_op(
-                    inputs={"X": [cur_loss]},
-                    outputs={"Out": [cur_avg_loss]},
-                    type="mean")
-                op.desc.infer_var_type(block.desc)
-                op.desc.infer_shape(block.desc)
-                avg_sum.append(cur_avg_loss)
-
-            loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
-            op_sum = block.append_op(
-                inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
-            op_sum.desc.infer_var_type(block.desc)
-            op_sum.desc.infer_shape(block.desc)
-
-            loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
-            op_loss = block.append_op(
-                inputs={"X": loss_sum},
-                outputs={"Out": loss},
-                type='scale',
-                attrs={'scale': 1.0 / float(len(avg_sum))})
-            op_loss.desc.infer_var_type(block.desc)
-            op_loss.desc.infer_shape(block.desc)
-
+        self._append_ops(block)
+        loss = append_loss_ops(block, output_names)
         param_grad_list = append_backward(
             loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
 
-        feed_dict = {
-            item[0].name: OpTest._numpy_to_lod_tensor(item[1], item[2], place)
-            for p_name in inputs_with_np for item in inputs_with_np[p_name]
-        }
+        inputs = self._get_inputs(block)
+        feed_dict = self.feed_var(inputs, place)
 
         fetch_list = [g for p, g in param_grad_list]
-        executor = Executor(place)
+        if parallel:
+            use_cuda = False
+            if isinstance(place, fluid.CUDAPlace(0)):
+                use_cuda = True
+            executor = fluid.ParallelExecutor(
+                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
+        else:
+            executor = Executor(place)
         return map(np.array,
                    executor.run(prog, feed_dict, fetch_list,
                                 return_numpy=False))
diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py
index 20cc3a643f1adfc04faad15e1b7baad3e22d9d29..4016089c01644f0389855ab114360f90c50a1bbe 100644
--- a/python/paddle/fluid/tests/unittests/test_crop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crop_op.py
@@ -42,9 +42,9 @@ class TestCropOp(OpTest):
     def setUp(self):
         self.op_type = "crop"
         self.crop_by_input = False
+        self.offset_by_input = False
         self.attrs = {}
         self.initTestCase()
-        self.attrs['offsets'] = self.offsets
         if self.crop_by_input:
             self.inputs = {
                 'X': np.random.random(self.x_shape).astype("float32"),
@@ -55,6 +55,10 @@ class TestCropOp(OpTest):
             self.inputs = {
                 'X': np.random.random(self.x_shape).astype("float32"),
             }
+        if self.offset_by_input:
+            self.inputs['Offsets'] = np.array(self.offsets).astype('int32')
+        else:
+            self.attrs['offsets'] = self.offsets
         self.outputs = {
             'Out': crop(self.inputs['X'], self.offsets, self.crop_shape)
         }
@@ -101,5 +105,22 @@ class TestCase4(TestCropOp):
         self.crop_by_input = True
 
 
+class TestCase5(TestCropOp):
+    def initTestCase(self):
+        self.x_shape = (3, 4, 5)
+        self.crop_shape = [2, 2, 3]
+        self.offsets = [1, 0, 2]
+        self.offset_by_input = True
+
+
+class TestCase6(TestCropOp):
+    def initTestCase(self):
+        self.x_shape = (10, 9, 14)
+        self.crop_shape = [3, 3, 5]
+        self.offsets = [3, 5, 4]
+        self.crop_by_input = True
+        self.offset_by_input = True
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
index 22329390754d8d010dced0d1aca35617140cd097..95af51f1b2f8cd9492baa9cb14fe31ffa586f2fc 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
@@ -30,9 +30,6 @@ class Memory(object):
         assert val.dtype == self.ex.dtype
         self.cur = val
 
-    def ex(self):
-        return self.ex
-
     def next(self):
         self.ex = self.cur
         self.cur = None
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index 1f52bd90d0d49bda6c180019e90ebd923c91439c..96d47906a0606bba4b1d2207f7da85b058e42a2b 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -252,5 +252,25 @@ class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
         self.axis = 1
 
 
+class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 20, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 10, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py
index f8ff5a3361af66612f08b2aa4eaffa363f04c594..e726f99d49877a1bc464090092ec80b97ab15d0c 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py
@@ -194,107 +194,104 @@ class TestLstmOp(OpTest):
             ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)
 
 
-class TestLstmOpHasInitial(TestLstmOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = True
-        self.is_reverse = True
-        self.use_peepholes = True
-
-    def test_check_grad(self):
-        # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
-            max_relative_error=5e-4)
-
-    def test_check_grad_ingore_bias(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('Bias'))
-
-    def test_check_grad_ingore_weight(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('Weight'))
-
-    def test_check_grad_ingore_input(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Weight', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('Input'))
-
-    def test_check_grad_ingore_h0(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('H0'))
-
-    def test_check_grad_ingore_c0(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('C0'))
-
-
-class TestLstmOpRerverse(TestLstmOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = False
-        self.is_reverse = True
-        self.use_peepholes = True
-
-
-class TestLstmOpNotUsePeepholes(TestLstmOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = False
-        self.is_reverse = True
-        self.use_peepholes = False
-
+# class TestLstmOpHasInitial(TestLstmOp):
+#     def set_argument(self):
+#         self.lod = [[0, 2, 5, 7]]
+#         self.D = 16
+
+#         self.act_gate = 'sigmoid'
+#         self.act_cell = 'tanh'
+#         self.act_cand = 'tanh'
+
+#         self.has_initial_state = True
+#         self.is_reverse = True
+#         self.use_peepholes = True
+
+#     def test_check_grad(self):
+#         # TODO(qingqing) remove folowing lines after the check_grad is refined.
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
+#             max_relative_error=5e-4)
+
+#     def test_check_grad_ingore_bias(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('Bias'))
+
+#     def test_check_grad_ingore_weight(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Bias'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('Weight'))
+
+#     def test_check_grad_ingore_input(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Weight', 'Bias'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('Input'))
+
+#     def test_check_grad_ingore_h0(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('H0'))
+
+#     def test_check_grad_ingore_c0(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('C0'))
+
+# class TestLstmOpRerverse(TestLstmOp):
+#     def set_argument(self):
+#         self.lod = [[0, 2, 5, 7]]
+#         self.D = 16
+
+#         self.act_gate = 'sigmoid'
+#         self.act_cell = 'tanh'
+#         self.act_cand = 'tanh'
+
+#         self.has_initial_state = False
+#         self.is_reverse = True
+#         self.use_peepholes = True
+
+# class TestLstmOpNotUsePeepholes(TestLstmOp):
+#     def set_argument(self):
+#         self.lod = [[0, 2, 5, 7]]
+#         self.D = 16
+
+#         self.act_gate = 'sigmoid'
+#         self.act_cell = 'tanh'
+#         self.act_cand = 'tanh'
+
+#         self.has_initial_state = False
+#         self.is_reverse = True
+#         self.use_peepholes = False
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index 862b7f8cb93620da4dd4673028776cfe565eeb0b..bbc782c1bce302df68ab30013f3a7667e51ed479 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -22,8 +22,8 @@ class TestMulOp(OpTest):
     def setUp(self):
         self.op_type = "mul"
         self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
-            'Y': np.random.random((84, 100)).astype("float32")
+            'X': np.random.random((2, 5)).astype("float32"),
+            'Y': np.random.random((5, 3)).astype("float32")
         }
         self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
 
@@ -46,13 +46,16 @@ class TestMulOp2(OpTest):
     def setUp(self):
         self.op_type = "mul"
         self.inputs = {
-            'X': np.random.random((15, 4, 12, 10)).astype("float32"),
-            'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32")
+            'X': np.random.random((3, 4, 4, 3)).astype("float32"),
+            'Y': np.random.random((2, 6, 1, 2, 3)).astype("float32")
         }
-        self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2}
-        result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10),
-                        self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9))
-        result = result.reshape(15, 4, 8, 2, 9)
+        self.attrs = {
+            'x_num_col_dims': 2,
+            'y_num_col_dims': 2,
+        }
+        result = np.dot(self.inputs['X'].reshape(3 * 4, 4 * 3),
+                        self.inputs['Y'].reshape(2 * 6, 1 * 2 * 3))
+        result = result.reshape(3, 4, 1, 2, 3)
         self.outputs = {'Out': result}
 
     def test_check_output(self):
@@ -73,9 +76,9 @@ class TestMulOp2(OpTest):
 class TestFP16MulOp1(OpTest):
     def setUp(self):
         self.op_type = "mul"
-        x = np.random.random((32, 84)).astype("float16")
-        y = np.random.random((84, 100)).astype("float16")
-        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
+        x = np.random.random((3, 5)).astype("float16")
+        y = np.random.random((5, 4)).astype("float16")
+        self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)}
         self.outputs = {'Out': np.dot(x, y)}
 
     def test_check_output(self):
@@ -88,13 +91,15 @@ class TestFP16MulOp1(OpTest):
 class TestFP16MulOp2(OpTest):
     def setUp(self):
         self.op_type = "mul"
-        x = np.random.random((15, 4, 12, 10)).astype("float16")
-        y = np.random.random((4, 30, 8, 2, 9)).astype("float16")
-        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
-        self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2}
-        result = np.dot(
-            x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9))
-        result = result.reshape(15, 4, 8, 2, 9)
+        x = np.random.random((3, 4, 4, 3)).astype("float16")
+        y = np.random.random((2, 6, 1, 2, 3)).astype("float16")
+        self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)}
+        self.attrs = {
+            'x_num_col_dims': 2,
+            'y_num_col_dims': 2,
+        }
+        result = np.dot(x.reshape(3 * 4, 4 * 3), y.reshape(2 * 6, 1 * 2 * 3))
+        result = result.reshape(3, 4, 1, 2, 3)
         self.outputs = {'Out': result}
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f845575a02869f08299d76b5600074598ca27f6c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestReverseOp(OpTest):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [0]
+
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = "reverse"
+        self.inputs = {"X": self.x}
+        self.attrs = {'axis': self.axis}
+        out = self.x
+        for a in self.axis:
+            out = np.flip(out, axis=a)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestCase0(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [1]
+
+
+class TestCase1(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [0, 1]
+
+
+class TestCase2(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4, 5)).astype('float32')
+        self.axis = [0, 2]
+
+
+class TestCase3(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4, 5)).astype('float32')
+        self.axis = [1, 2]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dc94a80c9d3999d34fdf0edbf82ffe297bd95d7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -0,0 +1,182 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+def as_lodtensor(np_array, lod, place):
+    tensor = core.LoDTensor()
+    tensor.set(np_value, place)
+    if lod is not None:
+        tensor.set_lod(lod)
+    return tensor
+
+
+def create_op(scope, op_type, inputs, outputs, attrs):
+    kwargs = dict()
+
+    op_maker = core.op_proto_and_checker_maker
+    op_role_attr_name = op_maker.kOpRoleAttrName()
+
+    if op_role_attr_name not in attrs:
+        attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)
+
+    def __create_var__(name, var_name):
+        scope.var(var_name).get_tensor()
+        kwargs[name].append(var_name)
+
+    for in_name, in_dup in Operator.get_op_inputs(op_type):
+        if in_name in inputs:
+            kwargs[in_name] = []
+            if in_dup:
+                sub_in = inputs[in_name]
+                for item in sub_in:
+                    sub_in_name, _ = item[0], item[1]
+                    __create_var__(in_name, sub_in_name)
+            else:
+                __create_var__(in_name, in_name)
+
+    for out_name, out_dup in Operator.get_op_outputs(op_type):
+        if out_name in outputs:
+            kwargs[out_name] = []
+            if out_dup:
+                sub_out = outputs[out_name]
+                for item in sub_out:
+                    sub_out_name, _ = item[0], item[1]
+                    __create_var__(out_name, sub_out_name)
+            else:
+                __create_var__(out_name, out_name)
+
+    for attr_name in Operator.get_op_attr_names(op_type):
+        if attr_name in attrs:
+            kwargs[attr_name] = attrs[attr_name]
+
+    return Operator(op_type, **kwargs)
+
+
+def set_input(scope, op, inputs, place):
+    def __set_input__(var_name, var):
+        if isinstance(var, tuple) or isinstance(var, np.ndarray):
+            tensor = scope.find_var(var_name).get_tensor()
+            if isinstance(var, tuple):
+                tensor.set_lod(var[1])
+                var = var[0]
+            tensor.set_dims(var.shape)
+            tensor.set(var, place)
+        elif isinstance(var, float):
+            scope.find_var(var_name).set_float(var)
+        elif isinstance(var, int):
+            scope.find_var(var_name).set_int(var)
+
+    for in_name, in_dup in Operator.get_op_inputs(op.type()):
+        if in_name in inputs:
+            if in_dup:
+                sub_in = inputs[in_name]
+                for item in sub_in:
+                    sub_in_name, sub_in_val = item[0], item[1]
+                    __set_input__(sub_in_name, sub_in_val)
+            else:
+                __set_input__(in_name, inputs[in_name])
+
+
+def append_input_output(block, op_proto, np_list, is_input, dtype):
+    '''Insert VarDesc and generate Python variable instance'''
+    proto_list = op_proto.inputs if is_input else op_proto.outputs
+
+    def create_var(block, name, np_list, var_proto):
+        dtype = None
+        shape = None
+        lod_level = None
+        if name not in np_list:
+            assert var_proto.intermediate, "{} not found".format(name)
+        else:
+            np_value = np_list[name]
+            if isinstance(np_value, tuple):
+                dtype = np_value[0].dtype
+                # output shape, lod should be infered from input.
+                if is_input:
+                    shape = list(np_value[0].shape)
+                    lod_level = len(np_value[1])
+            else:
+                dtype = np_value.dtype
+                if is_input:
+                    shape = list(np_value.shape)
+                    lod_level = 0
+        return block.create_var(
+            dtype=dtype, shape=shape, lod_level=lod_level, name=name)
+
+    var_dict = {}
+    for var_proto in proto_list:
+        var_name = str(var_proto.name)
+        if is_input:
+            if (var_name not in np_list) and var_proto.dispensable:
+                continue
+            assert (var_name in np_list) or (var_proto.dispensable), \
+                "Missing {} as input".format(var_name)
+        if var_proto.duplicable:
+            assert isinstance(np_list[var_name], list), \
+                "Duplicable {} should be set as list".format(var_name)
+            var_list = []
+            for (name, np_value) in np_list[var_name]:
+                var_list.append(
+                    create_var(block, name, {name: np_value}, var_proto))
+            var_dict[var_name] = var_list
+        else:
+            var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
+
+    return var_dict
+
+
+def append_loss_ops(block, output_names):
+    mean_inputs = map(block.var, output_names)
+    # for item in mean_inputs:
+    #     print(item)
+    #     print("Item", item.dtype)
+
+    if len(mean_inputs) == 1:
+        loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
+        op = block.append_op(
+            inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+    else:
+        avg_sum = []
+        for cur_loss in mean_inputs:
+            cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
+            op = block.append_op(
+                inputs={"X": [cur_loss]},
+                outputs={"Out": [cur_avg_loss]},
+                type="mean")
+            op.desc.infer_var_type(block.desc)
+            op.desc.infer_shape(block.desc)
+            avg_sum.append(cur_avg_loss)
+
+        loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
+        op_sum = block.append_op(
+            inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
+        op_sum.desc.infer_var_type(block.desc)
+        op_sum.desc.infer_shape(block.desc)
+
+        loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
+        op_loss = block.append_op(
+            inputs={"X": loss_sum},
+            outputs={"Out": loss},
+            type='scale',
+            attrs={'scale': 1.0 / float(len(avg_sum))})
+        op_loss.desc.infer_var_type(block.desc)
+        op_loss.desc.infer_shape(block.desc)
+    return loss
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index 2737f1c70d6d3b2927957a3b60d667237afaa9aa..efc28d899304b01a3085891f3ae9396d57c589a1 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -152,9 +152,9 @@ class Trainer(object):
             program_func_outs = train_func()
             self.train_func_outputs = program_func_outs if isinstance(
                 program_func_outs, list) else [program_func_outs]
-            self.test_program = self.train_program.clone()
+            self.test_program = self.train_program.clone(for_test=True)
 
-            # The fisrt element of program_func_outs is loss.
+            # The first element of program_func_outs is loss.
             loss = self.train_func_outputs[0]
 
             optimizer = optimizer_func()