Commit d5a88b93 authored by Yancey1989

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into overlap_memcpy_with_dist

...@@ -19,4 +19,4 @@ ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
...@@ -44,6 +44,16 @@ Currently supported `--model` argument include:
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```
## Prepare the RecordIO file to Achieve Better Performance
Running the following command will generate RecordIO files like "mnist.recordio" under the path
and with the batch size you choose. You can use batch_size=1 so that a later reader can change the
batch size at any time using `fluid.batch`.
```bash
python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
```
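`recordio_converter.py` also ships `prepare_cifar10` and `prepare_flowers`, so the other built-in datasets can be converted the same way. A minimal sketch, assuming the output directory `data` already exists and the converter is importable from the current directory:
```python
# Generate RecordIO files for the cifar10 and flowers datasets with
# batch_size=1, mirroring the mnist command above.
from recordio_converter import prepare_cifar10, prepare_flowers

prepare_cifar10("data", 1)   # writes data/cifar.recordio
prepare_flowers("data", 1)   # writes data/flowers.recordio
```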
## Run Distributed Benchmark on Kubernetes Cluster
You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
......
...@@ -38,10 +38,12 @@ def parse_args():
        default='resnet',
        help='The model to run benchmark with.')
    parser.add_argument(
        '--batch_size',
        type=int,
        default=32,
        help='The batch size on each gpu.')
    parser.add_argument(
        '--learning_rate', type=float, default=0.001, help='The learning rate.')
    # TODO(wuyi): add "--use_fake_data" option back.
    parser.add_argument(
        '--skip_batch_num',
        type=int,
...@@ -49,7 +51,10 @@ def parse_args():
        help='The first num of minibatch num to skip, for better performance test'
    )
    parser.add_argument(
        '--iterations',
        type=int,
        default=80,
        help='The number of minibatches, set to -1 to run all batches.')
    parser.add_argument(
        '--pass_num', type=int, default=100, help='The number of passes.')
    parser.add_argument(
...@@ -69,6 +74,7 @@ def parse_args():
        type=int,
        default=1,
        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
    # this option is available only for vgg and resnet.
    parser.add_argument(
        '--cpus',
        type=int,
...@@ -78,7 +84,7 @@ def parse_args():
        '--data_set',
        type=str,
        default='flowers',
        choices=['cifar10', 'flowers', 'imagenet'],
        help='Optional dataset for benchmark.')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
...@@ -108,6 +114,16 @@ def parse_args():
        default='local',
        choices=['local', 'pserver', 'nccl2'],
        help='Choose parameter update method, can be local, pserver, nccl2.')
    parser.add_argument(
        '--use_reader_op',
        action='store_true',
        help='Whether to use reader op, and must specify the data path if set this to true.'
    )
    parser.add_argument(
        '--data_path',
        type=str,
        default="",
        help='Directory that contains all the training recordio files.')
    args = parser.parse_args()
    return args
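Note that `--batch_size` is now the per-GPU batch size: the plain Python readers below are batched with `args.batch_size * args.gpus` and then split across devices by `ParallelExecutor`, while with `--use_reader_op` each device pulls its own `args.batch_size`-sample batch from the reader op. A small illustrative calculation (the concrete numbers are only an example):
```python
# Example only: samples consumed per iteration for either input path,
# matching the num_samples accounting in the training loops below.
batch_size, gpus = 32, 4                     # e.g. --batch_size 32 --gpus 4

feed_path_samples = batch_size * gpus        # python reader: one 128-sample batch per step
reader_op_samples = batch_size * gpus        # reader op: 32 samples on each of 4 GPUs
print(feed_path_samples, reader_op_samples)  # 128 128
```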
...@@ -210,6 +226,8 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    if not args.use_reader_op:
        feed_var_list = [
            var for var in train_prog.global_block().vars.itervalues()
            if var.is_data
...@@ -219,16 +237,38 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        train_losses = []
        if not args.use_reader_op:
            reader_generator = train_reader()
        batch_id = 0
        data = None
        while True:
            if not args.use_reader_op:
                data = next(reader_generator, None)
                if data == None:
                    break
            if iters == args.iterations:
                break
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if args.use_reader_op:
                try:
                    loss = exe.run(train_prog, fetch_list=[avg_loss])
                except fluid.core.EnforceNotMet as ex:
                    break
            else:
                loss = exe.run(train_prog,
                               feed=feeder.feed(data),
                               fetch_list=[avg_loss])
            iters += 1
            batch_id += 1
            # FIXME(wuyi): For use_reader_op, if the current
            # pass is not the last, the last batch of this pass
            # is also equal to args.batch_size.
            if args.use_reader_op:
                num_samples += args.batch_size * args.gpus
            else:
                num_samples += len(data)
            train_losses.append(loss)
            print("Pass: %d, Iter: %d, Loss: %f\n" %
...@@ -250,10 +290,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, ...@@ -250,10 +290,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
                   num_trainers, trainer_id):
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    if not args.use_reader_op:
        feed_var_list = [
            var for var in train_prog.global_block().vars.itervalues()
            if var.is_data
        ]
        feeder = fluid.DataFeeder(feed_var_list, place)
    # generate fake:
    if args.use_fake_data:
        for var in feed_var_list:
...@@ -270,7 +314,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                "value": 1.0,
                "dtype": var.dtype})
    if nccl_id_var and trainer_id == 0:
        #FIXME(wuyi): wait other trainer to start listening
        time.sleep(30)
...@@ -287,12 +330,21 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
        num_trainers=num_trainers,
        trainer_id=trainer_id)
    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        start_time = time.time()
        if not args.use_reader_op:
            reader_generator = train_reader()
        batch_id = 0
        data = None
        while True:
            if not args.use_reader_op:
                data = next(reader_generator, None)
                if data == None:
                    break
            if iters == args.iterations:
                break
            if args.profile and pass_id == 0 and batch_id == 5:
                profiler.start_profiler("All")
            elif args.profile and pass_id == 0 and batch_id == 10:
...@@ -301,19 +353,25 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if args.use_fake_data or args.use_reader_op:
                try:
                    loss, = exe.run([avg_loss.name])
                except fluid.core.EnforceNotMet as ex:
                    break
            else:
                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
            if args.update_method == "pserver":
                exe.bcast_params()
            if args.use_reader_op:
                num_samples += args.batch_size * args.gpus
            else:
                num_samples += len(data)
            iters += 1
            if batch_id % 1 == 0:
                print("Pass %d, batch %d, loss %s" %
                      (pass_id, batch_id, np.array(loss)))
            batch_id += 1
        print_train_time(start_time, time.time(), num_samples)
        if not args.no_test and batch_acc:
            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
......
...@@ -197,6 +197,8 @@ def lodtensor_to_ndarray(lod_tensor):
def get_model(args):
    if args.use_reader_op:
        raise Exception("machine_translation do not support reader op for now.")
    embedding_dim = 512
    encoder_size = 512
    decoder_size = 512
...@@ -221,7 +223,7 @@ def get_model(args):
    train_batch_generator = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=args.batch_size * args.gpus)
    test_batch_generator = paddle.batch(
        paddle.reader.shuffle(
......
...@@ -20,6 +20,7 @@ import numpy as np
import argparse
import time
import cProfile
import os

import paddle
import paddle.fluid as fluid
...@@ -65,7 +66,22 @@ def cnn_model(data):

def get_model(args):
    if args.use_reader_op:
        filelist = [
            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
        ]
        data_file = fluid.layers.open_files(
            filenames=filelist,
            shapes=[[-1, 1, 28, 28], (-1, 1)],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            thread_num=args.gpus,
            pass_num=args.pass_num)
        data_file = fluid.layers.double_buffer(
            fluid.layers.batch(
                data_file, batch_size=args.batch_size))
        images, label = fluid.layers.read_file(data_file)
    else:
        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
...@@ -103,7 +119,7 @@ def get_model(args):

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
...@@ -19,6 +19,7 @@ from __future__ import print_function
import functools
import numpy as np
import time
import os
import cProfile, pstats, StringIO
...@@ -26,6 +27,7 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
from recordio_converter import imagenet_train, imagenet_test

def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
...@@ -122,14 +124,46 @@ def get_model(args):
        else:
            dshape = [32, 32, 3]
        model = resnet_cifar10
        train_reader = paddle.dataset.cifar.train10()
        test_reader = paddle.dataset.cifar.test10()
    elif args.data_set == "flowers":
        class_dim = 102
        if args.data_format == 'NCHW':
            dshape = [3, 224, 224]
        else:
            dshape = [224, 224, 3]
        model = resnet_imagenet
        train_reader = paddle.dataset.flowers.train()
        test_reader = paddle.dataset.flowers.test()
    elif args.data_set == "imagenet":
        class_dim = 1000
        if args.data_format == 'NCHW':
            dshape = [3, 224, 224]
        else:
            dshape = [224, 224, 3]
        model = resnet_imagenet
        if not args.data_path:
            raise Exception(
                "Must specify --data_path when training with imagenet")
        train_reader = imagenet_train(args.data_path)
        test_reader = imagenet_test(args.data_path)

    if args.use_reader_op:
        filelist = [
            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
        ]
        data_file = fluid.layers.open_files(
            filenames=filelist,
            shapes=[[-1] + dshape, (-1, 1)],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            thread_num=args.gpus,
            pass_num=args.pass_num)
        data_file = fluid.layers.double_buffer(
            fluid.layers.batch(
                data_file, batch_size=args.batch_size))
        input, label = fluid.layers.read_file(data_file)
    else:
        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
...@@ -162,15 +196,10 @@ def get_model(args):
    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)

    batched_train_reader = paddle.batch(
        paddle.reader.shuffle(
            train_reader, buf_size=5120),
        batch_size=args.batch_size * args.gpus)
    batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size)

    return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc
...@@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size):
def get_model(args):
    if args.use_reader_op:
        raise Exception(
            "stacked_dynamic_lstm do not support reader op for now.")
    lstm_size = 512
    emb_dim = 512
    crop_size = 1500
...@@ -114,7 +117,7 @@ def get_model(args):
    train_reader = batch(
        paddle.reader.shuffle(
            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
        batch_size=args.batch_size * args.gpus)
    test_reader = batch(
        paddle.reader.shuffle(
            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
......
...@@ -22,6 +22,7 @@ import paddle.fluid as fluid
import paddle.fluid.core as core
import argparse
import functools
import os

def vgg16_bn_drop(input):
...@@ -65,8 +66,23 @@ def get_model(args):
    else:
        data_shape = [224, 224, 3]

    if args.use_reader_op:
        filelist = [
            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
        ]
        data_file = fluid.layers.open_files(
            filenames=filelist,
            shapes=[[-1] + data_shape, (-1, 1)],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            thread_num=args.gpus,
            pass_num=args.pass_num)
        data_file = fluid.layers.double_buffer(
            fluid.layers.batch(
                data_file, batch_size=args.batch_size))
        images, label = fluid.layers.read_file(data_file)
    else:
        images = fluid.layers.data(name='data', shape=data_shape, dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
...@@ -95,7 +111,7 @@ def get_model(args):
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
        batch_size=args.batch_size * args.gpus)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.dataset import mnist, cifar, flowers, image
def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
shape_label):
num_batches = 0
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(py_reader(), batch_size=batch_size)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=shape_data),
fluid.layers.data(
name='label', shape=shape_label, dtype='int64'),
],
place=fluid.CPUPlace())
num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
outfilepath, reader, feeder)
return num_batches
def prepare_mnist(outpath, batch_size):
outfilepath = os.path.join(outpath, "mnist.recordio")
convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])
def prepare_cifar10(outpath, batch_size):
outfilepath = os.path.join(outpath, "cifar.recordio")
convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])
def prepare_flowers(outpath, batch_size):
outfilepath = os.path.join(outpath, "flowers.recordio")
convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
[1])
def default_mapper(sample):
img, label = sample
img = image.simple_transform(
img, 256, 224, True, mean=[103.94, 116.78, 123.68])
return img.flatten().astype('float32'), label
def imagenet_train(data_dir):
contents = os.listdir(data_dir)
if set(contents) != set(
["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
raise Exception("Imagenet data contents error!")
img2label = dict()
imgfilelist = []
with open(os.path.join(data_dir, "train.txt")) as fn:
while 1:
l = fn.readline()
if not l:
break
img, lbl = l[:-1].split(" ")
img2label[img] = int(lbl)
imgfilelist.append(img)
# shuffle all, this is slow
random.shuffle(imgfilelist)
def train_reader():
for idx, imgfile in enumerate(imgfilelist):
data = image.load_image(
os.path.join(data_dir, "train", imgfile.lower()))
label = [img2label[imgfile], ]
yield [data, label]
return paddle.reader.map_readers(default_mapper, train_reader)
def imagenet_test(data_dir):
contents = os.listdir(data_dir)
if set(contents) != set(
["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
raise Exception("Imagenet data contents error!")
img2label = dict()
imgfilelist = []
with open(os.path.join(data_dir, "val.txt")) as fn:
while 1:
l = fn.readline()
if not l:
break
img, lbl = l[:-1].split(" ")
img2label[img] = int(lbl)
imgfilelist.append(img)
def test_reader():
for idx, imgfile in enumerate(imgfilelist):
base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
image_path = ".".join([base_path, "jpeg"])
data = image.load_image(image_path)
label = [img2label[imgfile], ]
yield [data, label]
return paddle.reader.map_readers(default_mapper, test_reader)
# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
def convert_reader_to_recordio_files(
filename,
batch_per_file,
reader_creator,
feeder,
compressor=core.RecordIOWriter.Compressor.Snappy,
max_num_records=1000,
feed_order=None):
if feed_order is None:
feed_order = feeder.feed_names
f_name, f_ext = os.path.splitext(filename)
assert (f_ext == ".recordio")
lines = []
f_idx = 0
counter = 0
for idx, batch in enumerate(reader_creator()):
lines.append(batch)
if idx >= batch_per_file and idx % batch_per_file == 0:
filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
with fluid.recordio_writer.create_recordio_writer(
filename, compressor, max_num_records) as writer:
for l in lines:
res = feeder.feed(l)
for each in feed_order:
writer.append_tensor(res[each])
writer.complete_append_tensor()
counter += 1
lines = []
f_idx += 1
print("written file: ", filename)
return counter
def prepare_imagenet(inpath, outpath, batch_size):
r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
feeder = fluid.DataFeeder(
feed_list=[
fluid.layers.data(
name="image", shape=[3, 224, 224]), fluid.layers.data(
name="label", shape=[1], dtype='int64')
],
place=fluid.CPUPlace())
outpath = os.path.join(outpath, "imagenet.recordio")
convert_reader_to_recordio_files(outpath, 10000, r, feeder)
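For ImageNet the converter writes sharded files (10000 batches per shard) instead of a single RecordIO file. A hypothetical invocation, assuming the raw data lives under `/data/imagenet` with the `train/`, `train.txt`, `val/`, `val.txt` layout checked by `imagenet_train` above, and that `/data/recordio` exists:
```python
# Hypothetical usage of the helpers above; both paths are placeholders.
from recordio_converter import prepare_imagenet

# Reads /data/imagenet/train.txt plus the images under /data/imagenet/train/
# and writes shards such as /data/recordio/imagenet-00000.recordio.
prepare_imagenet("/data/imagenet", "/data/recordio", batch_size=1)
```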
...@@ -92,6 +92,9 @@ if(WITH_GPU)
    if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
      message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
    endif()
    if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
      message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
    endif()
    include_directories(${TENSORRT_INCLUDE_DIR})
  endif()
elseif(WITH_AMD_GPU)
......
...@@ -24,31 +24,37 @@ set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
set(inference_deps paddle_inference_api paddle_fluid_api)

# if anakin is set enable anakin api implementation
if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
  set(ANAKIN_FOUND ON)
else()
  set(ANAKIN_FOUND OFF)
endif()

function(fetch_include_recursively root_dir)
  if (IS_DIRECTORY ${root_dir})
    include_directories(${root_dir})
  endif()
  file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
  foreach(sub ${ALL_SUB})
    if (IS_DIRECTORY ${root_dir}/${sub})
      fetch_include_recursively(${root_dir}/${sub})
    endif()
  endforeach()
endfunction()

if (ANAKIN_FOUND)
  # Anakin's code style doesn't follow google c style.
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
  message(STATUS "Anakin for inference is enabled")
  message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
  fetch_include_recursively(${ANAKIN_INCLUDE})
  link_directories(${ANAKIN_LIBRARY})
  nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
  target_link_libraries(inference_anakin_api anakin anakin_saber_common)
  list(APPEND inference_deps inference_anakin_api)
endif()
...@@ -84,8 +90,8 @@ inference_api_test(test_paddle_inference_api_impl
  ARGS test_word2vec test_image_classification)

if (ANAKIN_FOUND)
  cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
          DEPS ${inference_deps})
endif()

if(WITH_TESTING)
......
...@@ -19,8 +19,8 @@ limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <memory>
#include <thread>
#include "paddle/contrib/inference/paddle_inference_api.h"

namespace paddle {
namespace demo {
...@@ -61,13 +61,67 @@ void Main(bool use_gpu) {
  for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
    LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
  }
// TODO(Superjomn): this should be freed automatically
free(outputs[0].data.data);
}
}
void MainThreads(int num_threads, bool use_gpu) {
// Multi-threads are only supported on CPU
// 0. Create PaddlePredictor with a config.
NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
auto main_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
std::vector<std::thread> threads;
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
// 1. clone a predictor which shares the same parameters
auto predictor = main_predictor->Clone();
constexpr int num_batches = 3;
for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
// 2. Dummy Input Data
int64_t data[4] = {1, 2, 3, 4};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = buf,
.dtype = PaddleDType::INT64};
std::vector<PaddleTensor> inputs(4, tensor);
std::vector<PaddleTensor> outputs;
// 3. Run
CHECK(predictor->Run(inputs, &outputs));
// 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "TID: " << tid << ", "
<< "output buffer size: " << outputs.front().data.length;
const size_t num_elements = outputs.front().data.length / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
}
free(outputs[0].data.data);
}
});
}
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
} }
} }
TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }

#ifdef PADDLE_WITH_CUDA
TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
#endif

}  // namespace demo
......
...@@ -113,5 +113,4 @@ struct AnakinConfig : public PaddlePredictor::Config {
// Similarly, each engine kind should map to a unique predictor implementation.
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);

}  // namespace paddle
...@@ -24,8 +24,16 @@ PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
}

bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
  if (!(graph_.load(config.model_file))) {
    return false;
  }
  graph_.ResetBatchSize("input_0", config.max_batch_size);
  // optimization for graph
  if (!(graph_.Optimize())) {
    return false;
  }
  // construct executer
  executor_.init(graph_);
  return true;
}
...@@ -38,24 +46,30 @@ bool PaddleInferenceAnakinPredictor::Run(
                 << "'s type is not float";
      return false;
    }
    auto d_tensor_in_p = executor_.get_in(input.name);
    float *d_data_p = d_tensor_in_p->mutable_data();
    if (cudaMemcpy(d_data_p,
                   static_cast<float *>(input.data.data),
                   d_tensor_in_p->valid_size() * sizeof(float),
                   cudaMemcpyHostToDevice) != 0) {
      LOG(ERROR) << "copy data from CPU to GPU error";
      return false;
    }
  }

  executor_.prediction();

  if (output_data->empty()) {
    LOG(ERROR) << "At least one output should be set with tensors' names.";
    return false;
  }
  for (auto &output : *output_data) {
    auto *tensor = executor_.get_out(output.name);
    output.shape = tensor->shape();
    // Copy data from GPU -> CPU
    if (cudaMemcpy(output.data.data,
                   tensor->mutable_data(),
                   tensor->valid_size() * sizeof(float),
                   cudaMemcpyDeviceToHost) != 0) {
      LOG(ERROR) << "copy data from GPU to CPU error";
      return false;
...@@ -64,9 +78,26 @@ bool PaddleInferenceAnakinPredictor::Run(
  return true;
}

anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
    &PaddleInferenceAnakinPredictor::get_executer() {
  return executor_;
}

// The cloned Anakin Predictor shares the same network weights with the
// original Predictor.
std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
  VLOG(3) << "Anakin Predictor::clone";
  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
  // construct executer from other graph
  auto anakin_predictor_p =
      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
  if (!anakin_predictor_p) {
    LOG(ERROR) << "fail to call Init";
    return nullptr;
  }
  anakin_predictor_p->get_executer().init(graph_);
  return std::move(cls);
}

// A factory to help create different predictors.
...@@ -74,6 +105,7 @@ template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
    const AnakinConfig &config) {
  VLOG(3) << "Anakin Predictor create.";
  std::unique_ptr<PaddlePredictor> x(
      new PaddleInferenceAnakinPredictor(config));
  return x;
......
...@@ -20,32 +20,42 @@ limitations under the License. */
#pragma once

// NOTE This header file do not have namespace.
//#include <test/framework/net/paddle_api.h>
#include "paddle/contrib/inference/paddle_inference_api.h"

#include "framework/core/net/net.h"
#include "saber/saber_types.h"

namespace paddle {

class PaddleInferenceAnakinPredictor : public PaddlePredictor {
 public:
  PaddleInferenceAnakinPredictor() {}

  PaddleInferenceAnakinPredictor(const AnakinConfig& config);

  // NOTE Unlike the native engine, the buffers of anakin engine's output_data
  // should be allocated first.
  bool Run(const std::vector<PaddleTensor>& inputs,
           std::vector<PaddleTensor>* output_data) override;

  std::unique_ptr<PaddlePredictor> Clone() override;

  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
  get_executer();

  ~PaddleInferenceAnakinPredictor() override{};

 private:
  bool Init(const AnakinConfig& config);

  anakin::graph::Graph<anakin::NV,
                       anakin::saber::AK_FLOAT,
                       anakin::Precision::FP32>
      graph_;
  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
      executor_;
  AnakinConfig config_;
};

}  // namespace paddle
...@@ -12,16 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <glog/logging.h>
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "paddle/contrib/inference/paddle_inference_api.h"

namespace paddle {

AnakinConfig GetConfig() {
  AnakinConfig config;
  config.model_file = "./mobilenet_v2.anakin.bin";
  config.device = 0;
  config.max_batch_size = 1;
  return config;
}

TEST(inference, anakin) {
  AnakinConfig config = GetConfig();
  auto predictor =
      CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
float data[1 * 3 * 224 * 224] = {1.0f};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "input_0",
.shape = std::vector<int>({1, 3, 224, 224}),
.data = buf,
.dtype = PaddleDType::FLOAT32};
// For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
float data_out[1000];
PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
PaddleTensor tensor_out{.name = "prob_out",
.shape = std::vector<int>({1000, 1}),
.data = buf_out,
.dtype = PaddleDType::FLOAT32};
std::vector<PaddleTensor> outputs(1, tensor_out);
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
float* data_o = static_cast<float*>(outputs[0].data.data);
for (size_t j = 0; j < 1000; ++j) {
LOG(INFO) << "output[" << j << "]: " << data_o[j];
}
} }
} // namespace paddle } // namespace paddle
...@@ -15,6 +15,8 @@ limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <thread>

#include "gflags/gflags.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/tests/test_helper.h"
...@@ -45,14 +47,19 @@ NativeConfig GetConfig() {
  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
  LOG(INFO) << "dirname " << config.model_dir;
  config.fraction_of_gpu_memory = 0.15;
#ifdef PADDLE_WITH_CUDA
  config.use_gpu = true;
#else
  config.use_gpu = false;
#endif
  config.device = 0;
  return config;
}

void MainWord2Vec(bool use_gpu) {
  NativeConfig config = GetConfig();
  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
  config.use_gpu = use_gpu;

  framework::LoDTensor first_word, second_word, third_word, fourth_word;
  framework::LoD lod{{0, 1}};
...@@ -100,11 +107,12 @@ TEST(paddle_inference_api_impl, word2vec) {
  free(outputs[0].data.data);
}

void MainImageClassification(bool use_gpu) {
  int batch_size = 2;
  bool use_mkldnn = false;
  bool repeat = false;
  NativeConfig config = GetConfig();
  config.use_gpu = use_gpu;
  config.model_dir =
      FLAGS_dirname + "image_classification_resnet.inference.model";
...@@ -149,4 +157,143 @@ TEST(paddle_inference_api_impl, image_classification) {
  free(data);
}
void MainThreadsWord2Vec(bool use_gpu) {
NativeConfig config = GetConfig();
config.use_gpu = use_gpu;
auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
// prepare inputs data and reference results
constexpr int num_jobs = 3;
std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
std::vector<framework::LoDTensor> refs(num_jobs);
for (size_t i = 0; i < jobs.size(); ++i) {
// each job has 4 words
jobs[i].resize(4);
for (size_t j = 0; j < 4; ++j) {
framework::LoD lod{{0, 1}};
int64_t dict_size = 2073; // The size of dictionary
SetupLoDTensor(&jobs[i][j], lod, static_cast<int64_t>(0), dict_size - 1);
paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j]));
}
// get reference result of each job
std::vector<paddle::framework::LoDTensor*> ref_feeds;
std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]);
for (auto& word : jobs[i]) {
ref_feeds.push_back(&word);
}
TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
}
// create threads and each thread run 1 job
std::vector<std::thread> threads;
for (int tid = 0; tid < num_jobs; ++tid) {
threads.emplace_back([&, tid]() {
auto predictor = main_predictor->Clone();
auto& local_inputs = paddle_tensor_feeds[tid];
std::vector<PaddleTensor> local_outputs;
ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
// check outputs range
ASSERT_EQ(local_outputs.size(), 1UL);
const size_t len = local_outputs[0].data.length;
float* data = static_cast<float*>(local_outputs[0].data.data);
for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0);
}
// check outputs correctness
float* ref_data = refs[tid].data<float>();
EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3);
}
free(data);
});
}
for (int i = 0; i < num_jobs; ++i) {
threads[i].join();
}
}
void MainThreadsImageClassification(bool use_gpu) {
constexpr int num_jobs = 4; // each job run 1 batch
constexpr int batch_size = 1;
NativeConfig config = GetConfig();
config.use_gpu = use_gpu;
config.model_dir =
FLAGS_dirname + "image_classification_resnet.inference.model";
auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
std::vector<framework::LoDTensor> jobs(num_jobs);
std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
std::vector<framework::LoDTensor> refs(num_jobs);
for (size_t i = 0; i < jobs.size(); ++i) {
// prepare inputs
std::vector<std::vector<int64_t>> feed_target_shapes =
GetFeedTargetShapes(config.model_dir, /*is_combined*/ false);
feed_target_shapes[0][0] = batch_size;
framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
SetupTensor<float>(&jobs[i], input_dims, 0.f, 1.f);
paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i]));
// get reference result of each job
std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
}
// create threads and each thread run 1 job
std::vector<std::thread> threads;
for (int tid = 0; tid < num_jobs; ++tid) {
threads.emplace_back([&, tid]() {
auto predictor = main_predictor->Clone();
auto& local_inputs = paddle_tensor_feeds[tid];
std::vector<PaddleTensor> local_outputs;
ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
// check outputs correctness
ASSERT_EQ(local_outputs.size(), 1UL);
const size_t len = local_outputs[0].data.length;
float* data = static_cast<float*>(local_outputs[0].data.data);
float* ref_data = refs[tid].data<float>();
EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3);
}
free(data);
});
}
for (int i = 0; i < num_jobs; ++i) {
threads[i].join();
}
}
TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); }
TEST(inference_api_native, word2vec_cpu_threads) {
MainThreadsWord2Vec(false /*use_gpu*/);
}
TEST(inference_api_native, image_classification_cpu) {
  MainImageClassification(false /*use_gpu*/);
}
TEST(inference_api_native, image_classification_cpu_threads) {
MainThreadsImageClassification(false /*use_gpu*/);
}
#ifdef PADDLE_WITH_CUDA
TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
TEST(inference_api_native, word2vec_gpu_threads) {
MainThreadsWord2Vec(true /*use_gpu*/);
}
TEST(inference_api_native, image_classification_gpu) {
  MainImageClassification(true /*use_gpu*/);
}
TEST(inference_api_native, image_classification_gpu_threads) {
MainThreadsImageClassification(true /*use_gpu*/);
}
#endif
} // namespace paddle } // namespace paddle
...@@ -27,6 +27,7 @@ enum class DataLayout {
  kNHWC = 0,
  kNCHW = 1,
  kAnyLayout = 2,
  kMKLDNN = 3,  // all layouts supported by MKLDNN internally
};

inline DataLayout StringToDataLayout(const std::string& str) {
...@@ -41,6 +42,8 @@ inline DataLayout StringToDataLayout(const std::string& str) {
    return DataLayout::kNCHW;
  } else if (s == "ANYLAYOUT") {
    return DataLayout::kAnyLayout;
  } else if (s == "MKLDNNLAYOUT") {
    return DataLayout::kMKLDNN;
  } else {
    PADDLE_THROW("Unknown storage order string: %s", s);
  }
...@@ -54,8 +57,10 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) {
      return "NCHW";
    case DataLayout::kAnyLayout:
      return "ANY_LAYOUT";
    case DataLayout::kMKLDNN:
      return "MKLDNNLAYOUT";
    default:
      PADDLE_THROW("unknown DataLayout %d", data_layout);
  }
}
......
...@@ -16,6 +16,9 @@
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif

namespace paddle {
namespace framework {
...@@ -88,5 +91,85 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var,
  out->set_layout(expected_kernel_type.data_layout_);
}
#ifdef PADDLE_WITH_MKLDNN
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::reorder;
void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) {
switch (type) {
case mkldnn::memory::data_type::f32:
return platform::to_void_cast(tensor.data<float>());
case mkldnn::memory::data_type::s8:
return platform::to_void_cast(tensor.data<char>());
case mkldnn::memory::data_type::u8:
return platform::to_void_cast(tensor.data<unsigned char>());
case mkldnn::memory::data_type::s16:
return platform::to_void_cast(tensor.data<int16_t>());
case mkldnn::memory::data_type::s32:
return platform::to_void_cast(tensor.data<int32_t>());
default:
PADDLE_THROW("wrong mkldnn type provided");
}
}
#endif
void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type,
const Tensor& in, Tensor* out) {
auto in_layout = kernel_type_for_var.data_layout_;
auto out_layout = expected_kernel_type.data_layout_;
PADDLE_ENFORCE(
in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
"TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
"non-MKLDNN");
#ifdef PADDLE_WITH_MKLDNN
PADDLE_ENFORCE(in.format() != memory::format::format_undef &&
in.format() != memory::format::any,
"Input tensor should have specified memory format");
// Set default as NCHW in case not specified
out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
std::vector<int> out_tz = in_tz;
memory::data_type in_type = ToMKLDNNDataType(in.type());
PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
"Input tensor type is not supported: ", in.type().name());
memory::data_type out_type = in_type;
memory::format in_format =
in_tz.size() == 2 ? memory::format::nc : in.format();
memory::format out_format =
out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout);
void* in_data = GetDataFromTensor(in, in_type);
// The output tensor has the same dims as the input. Reorder doesn't change dims.
out->Resize(in.dims());
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
platform::Reorder(in_memory, out_memory);
out->set_layout(out_layout);
// reset format since the out tensor will be fed to a non-MKLDNN OpKernel
out->set_format(memory::format::format_undef);
#endif
}
}  // namespace framework
}  // namespace paddle
...@@ -14,6 +14,7 @@
#pragma once

#include <map>
#include <vector>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/tensor.h"
...@@ -22,6 +23,50 @@
namespace paddle {
namespace framework {
#ifdef PADDLE_WITH_MKLDNN
using MKLDNNFormat = mkldnn::memory::format;
using MKLDNNDataType = mkldnn::memory::data_type;
inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) {
switch (layout) {
case DataLayout::kNHWC:
return MKLDNNFormat::nhwc;
case DataLayout::kNCHW:
return MKLDNNFormat::nchw;
default:
PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
DataLayoutToString(layout));
}
}
inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) {
switch (format) {
case MKLDNNFormat::nhwc:
return DataLayout::kNHWC;
case MKLDNNFormat::nchw:
return DataLayout::kNCHW;
default:
PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
}
}
inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
static const std::map<std::type_index, MKLDNNDataType> dict{
{std::type_index(typeid(float)), MKLDNNDataType::f32}, // NOLINT
{std::type_index(typeid(char)), MKLDNNDataType::s8}, // NOLINT
{std::type_index(typeid(unsigned char)), MKLDNNDataType::u8},
{std::type_index(typeid(int16_t)), MKLDNNDataType::s16},
{std::type_index(typeid(int32_t)), MKLDNNDataType::s32}};
auto iter = dict.find(type);
if (iter != dict.end()) return iter->second;
return MKLDNNDataType::data_undef;
}
#endif
void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type,
const Tensor& in, Tensor* out);
std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);

void TransDataLayout(const OpKernelType& kernel_type_for_var,
......
...@@ -33,11 +33,34 @@ void DataTransform(const OpKernelType& expected_kernel_type,
  Tensor in;
  in.ShareDataWith(input_tensor);
  Tensor out;
  DataLayout lin = kernel_type_for_var.data_layout_;
  DataLayout lout = expected_kernel_type.data_layout_;

  // do layout transform
  if (NeedTransformLayout(lout, lin)) {
    if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) {
      PADDLE_ENFORCE(
          !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN),
          "No layout transform needed between two MKLDNN OPKernels");

      if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) {
#ifdef PADDLE_WITH_MKLDNN
        // Case 1 - transform from a non-MKLDNN OPKernel to an MKLDNN OPKernel.
        // Just set layout/format. No real transform occurs.
        out.ShareDataWith(input_tensor);
        out.set_layout(DataLayout::kMKLDNN);
        out.set_format(ToMKLDNNFormat(lin));
#endif
      } else {
        // Case 2 - transform from an MKLDNN OPKernel to a non-MKLDNN OPKernel.
        // Do the transform via the MKLDNN lib.
        TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in,
                                  &out);
      }
    } else {
      // Case 3 - transform between non-MKLDNN OPKernels.
      TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
    }
    transformed = true;
    PassTensorData(&out, &in);
  }
......
...@@ -13,7 +13,7 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro
if(WITH_GPU)
  nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
      dynload_cuda variable_visitor)
  set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
  nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
  nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
...@@ -25,6 +25,7 @@ else()
endif()

cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)

cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
    scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
void FuseVarsOpHandle::RunImpl() {
WaitInputVarGenerated(place_);
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto out_var_handle = out_var_handles[0];
auto out_var = scope->Var(out_var_handle->name_);
auto out_tensor = out_var->GetMutable<LoDTensor>();
out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
int64_t s = 0;
for (size_t i = 1; i < out_var_handles.size(); ++i) {
auto out_name = out_var_handles[i]->name_;
auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
auto numel = this->inputs_numel_.at(out_name);
out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
s += numel;
}
this->RunAndRecordEvent([this] {});
}
std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <typeindex>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct FuseVarsOpHandle : public OpHandleBase {
public:
FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
const std::unordered_map<std::string, int64_t> &inputs_numel,
const std::type_index &var_type)
: local_scope_(local_scope),
place_(place),
inputs_numel_(inputs_numel),
type_(var_type) {
total_numel_ = 0;
for (auto in_numel : inputs_numel) {
PADDLE_ENFORCE_GT(in_numel.second, 0);
total_numel_ += in_numel.second;
}
}
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
private:
Scope *local_scope_;
const platform::Place place_;
const std::unordered_map<std::string, int64_t> inputs_numel_;
const std::type_index type_;
int64_t total_numel_;
};
} // namespace details
} // namespace framework
} // namespace paddle
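FuseVarsOpHandle::RunImpl above allocates one flat tensor of total_numel_ elements and lets every fused variable view a contiguous slice of it through ShareDataWith. A self-contained toy that reproduces the same offset bookkeeping (the variable names and sizes below are made up for illustration; this is not Paddle code) could look like:

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Each fused variable and the number of elements it needs.
  std::map<std::string, int64_t> inputs_numel = {{"w0_grad", 4}, {"w1_grad", 6}};
  int64_t total = 0;
  for (auto &p : inputs_numel) total += p.second;

  std::vector<float> fused(total, 0.f);  // plays the role of the fused LoDTensor

  int64_t s = 0;
  for (auto &p : inputs_numel) {
    // The real op would do out_t->ShareDataWith(out_tensor->Slice(s, s + numel)).
    std::printf("%s -> [%lld, %lld)\n", p.first.c_str(),
                static_cast<long long>(s), static_cast<long long>(s + p.second));
    s += p.second;
  }
  return 0;
}
```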
...@@ -230,7 +230,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -230,7 +230,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
if (op->Type() == "concat") { if (op->Type() == "concat") {
auto got = remote_vars_devices_.find(op->InputArgumentNames()[0]); auto got = remote_vars_devices_.find(op->InputArgumentNames()[0]);
PADDLE_ENFORCE(got != remote_vars_devices_.end(), PADDLE_ENFORCE(got != remote_vars_devices_.end(),
"can not find right place to concat received var."); "can not find right place to concatenate received var.");
CreateDistTrainOp(&result, *op, got->second); CreateDistTrainOp(&result, *op, got->second);
} else { } else {
CreateDistTrainOp(&result, *op, 0); CreateDistTrainOp(&result, *op, 0);
...@@ -503,10 +503,9 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, ...@@ -503,10 +503,9 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
} }
void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op, void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op,
int place_id) const { int device_id) const {
auto &p = places_[place_id]; result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[device_id],
auto *s = local_scopes_[place_id]; op.Type(), places_[device_id]));
result->ops_.emplace_back(new RPCOpHandle(op, s, p, op.Type()));
if (op.Type() == "send_barrier") { if (op.Type() == "send_barrier") {
ConnectOp(result, result->ops_.back().get(), "send_vars"); ConnectOp(result, result->ops_.back().get(), "send_vars");
...@@ -524,7 +523,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op, ...@@ -524,7 +523,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op,
// TODO(Yancey1989): schedule rpc op on different place may // TODO(Yancey1989): schedule rpc op on different place may
// increase throughput // increase throughput
CreateOpHandleIOs(result, op, place_id); CreateOpHandleIOs(result, op, device_id);
} }
bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
......
...@@ -58,7 +58,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -58,7 +58,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
private: private:
void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
size_t place_id) const; size_t device_id) const;
private: private:
std::string loss_var_name_; std::string loss_var_name_;
......
...@@ -11,10 +11,12 @@ ...@@ -11,10 +11,12 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <algorithm>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#include <algorithm>
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -30,27 +32,34 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( ...@@ -30,27 +32,34 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
} }
void NCCLAllReduceOpHandle::RunImpl() { void NCCLAllReduceOpHandle::RunImpl() {
if (inputs_.size() == 1) { if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1; return; // No need to all reduce when GPU count = 1;
} else { } else {
// Wait input done // Wait input done
WaitInputVarGenerated(); WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_; auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
int dtype = -1; PADDLE_ENFORCE_EQ(
size_t numel = 0; in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
std::vector<const LoDTensor *> lod_tensors; std::vector<const LoDTensor *> lod_tensors;
for (size_t i = 0; i < local_scopes_.size(); ++i) { for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *s = local_scopes_[i]; auto *s = local_scopes_[i];
auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>(); auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &lod_tensor =
auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>(); local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
lod_tensors.emplace_back(&lod_tensor); lod_tensors.emplace_back(&lod_tensor);
PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
"The name of input and output should be equal.");
} }
if (platform::is_gpu_place(lod_tensors[0]->place())) { if (platform::is_gpu_place(lod_tensors[0]->place())) {
int dtype = -1;
size_t numel = 0;
std::vector<std::function<void()>> all_reduce_calls; std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) { for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i]; auto &p = places_[i];
...@@ -96,7 +105,7 @@ void NCCLAllReduceOpHandle::RunImpl() { ...@@ -96,7 +105,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
auto &scope = auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>(); *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i]; auto &p = places_[i];
auto *var = scope.FindVar(var_name); auto *var = scope.FindVar(in_var_handles[i]->name_);
auto *dev_ctx = dev_ctxes_[p]; auto *dev_ctx = dev_ctxes_[p];
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
......
...@@ -104,6 +104,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { ...@@ -104,6 +104,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
} }
} }
size_t OpHandleBase::NoDummyInputSize() const {
size_t cnt = 0;
for (auto *in : inputs_) {
if (dynamic_cast<DummyVarHandle *>(in) == nullptr) {
++cnt;
}
}
return cnt;
}
bool OpHandleBase::NeedWait(VarHandleBase *in_var) { bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
return in_var && in_var->generated_op_; return in_var && in_var->generated_op_;
} }
......
...@@ -80,6 +80,8 @@ class OpHandleBase { ...@@ -80,6 +80,8 @@ class OpHandleBase {
const std::vector<VarHandleBase *> &Outputs() const { return outputs_; } const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
size_t NoDummyInputSize() const;
protected: protected:
void RunAndRecordEvent(const std::function<void()> &callback); void RunAndRecordEvent(const std::function<void()> &callback);
......
...@@ -19,12 +19,12 @@ namespace framework { ...@@ -19,12 +19,12 @@ namespace framework {
namespace details { namespace details {
RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc, RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc,
const Scope *local_scope, const platform::Place &place, const Scope *local_scope, const std::string &name,
const std::string &name) const platform::Place &place)
: op_(framework::OpRegistry::CreateOp(op_desc)), : op_(framework::OpRegistry::CreateOp(op_desc)),
local_scope_(local_scope), local_scope_(local_scope),
place_(place), name_(name),
name_(name) {} place_(place) {}
void RPCOpHandle::RunImpl() { void RPCOpHandle::RunImpl() {
// TODO(wuyi): need further analysis whether wait VarDummyHandle. // TODO(wuyi): need further analysis whether wait VarDummyHandle.
......
...@@ -29,7 +29,7 @@ namespace details { ...@@ -29,7 +29,7 @@ namespace details {
struct RPCOpHandle : public OpHandleBase { struct RPCOpHandle : public OpHandleBase {
RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope, RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
const platform::Place& place, const std::string& name); const std::string& name, const platform::Place& place);
std::string Name() const override; std::string Name() const override;
...@@ -43,8 +43,8 @@ struct RPCOpHandle : public OpHandleBase { ...@@ -43,8 +43,8 @@ struct RPCOpHandle : public OpHandleBase {
private: private:
std::unique_ptr<OperatorBase> op_; std::unique_ptr<OperatorBase> op_;
const Scope* local_scope_; const Scope* local_scope_;
const platform::Place& place_;
const std::string name_; const std::string name_;
platform::Place place_;
}; };
} // namespace details } // namespace details
......
...@@ -30,7 +30,9 @@ class SSAGraphBuilder { ...@@ -30,7 +30,9 @@ class SSAGraphBuilder {
SSAGraphBuilder() {} SSAGraphBuilder() {}
virtual ~SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {}
virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0; virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
virtual int GetRemoteVarDeviceId(const std::string &var_name) const = 0; virtual int GetRemoteVarDeviceId(const std::string &var_name) const {
return -1;
}
DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
class SSAGraph; struct SSAGraph;
class SSAGraphPrinter { class SSAGraphPrinter {
public: public:
virtual ~SSAGraphPrinter() {} virtual ~SSAGraphPrinter() {}
......
...@@ -87,7 +87,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) { ...@@ -87,7 +87,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
} }
inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r; bool ret =
(l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r);
#ifdef PADDLE_WITH_MKLDNN
// Layout transform needed for either non-MKLDNN to MKLDNN or vice versa
ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN);
ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN);
#endif
return ret;
} }
inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
......
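The NeedTransformLayout change above makes any crossing into or out of the MKLDNN layout count as a transform, while kAnyLayout still matches the plain layouts. A standalone check that re-implements the same predicate locally (illustration only, no Paddle headers):

```cpp
#include <cassert>

enum class DL { kNHWC, kNCHW, kAnyLayout, kMKLDNN };

bool NeedTransformLayout(DL l, DL r) {
  bool ret = (l != DL::kAnyLayout && r != DL::kAnyLayout && l != r);
  ret = ret || (l != DL::kMKLDNN && r == DL::kMKLDNN);  // entering an MKLDNN kernel
  ret = ret || (l == DL::kMKLDNN && r != DL::kMKLDNN);  // leaving an MKLDNN kernel
  return ret;
}

int main() {
  assert(!NeedTransformLayout(DL::kNCHW, DL::kNCHW));       // same plain layout
  assert(NeedTransformLayout(DL::kNCHW, DL::kNHWC));        // plain <-> plain, different
  assert(NeedTransformLayout(DL::kNCHW, DL::kMKLDNN));      // Case 1 in data_transform.cc
  assert(NeedTransformLayout(DL::kMKLDNN, DL::kNCHW));      // Case 2 in data_transform.cc
  assert(!NeedTransformLayout(DL::kAnyLayout, DL::kNCHW));  // kAnyLayout matches plain layouts
  return 0;
}
```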
...@@ -83,8 +83,14 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> { ...@@ -83,8 +83,14 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
void operator()(const char* op_type, const char* library_type) const { void operator()(const char* op_type, const char* library_type) const {
using T = typename KERNEL_TYPE::ELEMENT_TYPE; using T = typename KERNEL_TYPE::ELEMENT_TYPE;
std::string library(library_type);
std::string data_layout = "ANYLAYOUT";
if (library == "MKLDNN") {
data_layout = "MKLDNNLAYOUT";
}
OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
DataLayout::kAnyLayout, StringToLibraryType(library_type)); StringToDataLayout(data_layout),
StringToLibraryType(library_type));
OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value; constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
...@@ -99,7 +105,8 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> { ...@@ -99,7 +105,8 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
void operator()(const char* op_type, const char* library_type) const {} void operator()(const char* op_type, const char* library_type) const {}
}; };
// User can register many kernels in one place. The data type could be different. // User can register many kernels in one place. The data type could be
// different.
template <typename PlaceType, typename... KernelType> template <typename PlaceType, typename... KernelType>
class OpKernelRegistrar : public Registrar { class OpKernelRegistrar : public Registrar {
public: public:
...@@ -149,15 +156,15 @@ class OpKernelRegistrar : public Registrar { ...@@ -149,15 +156,15 @@ class OpKernelRegistrar : public Registrar {
/** /**
* Macro to register OperatorKernel. * Macro to register OperatorKernel.
*/ */
#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...) \ #define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ __reg_op_kernel_##op_type##_##library_type##__, \
"REGISTER_OP_KERNEL must be called in global namespace"); \ "REGISTER_OP_KERNEL must be called in global namespace"); \
static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \ static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \
__op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type, \ __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \
#LIBRARY_TYPE); \ #library_type); \
int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() { \ int TouchOpKernelRegistrar_##op_type##_##library_type() { \
__op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch(); \ __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \
return 0; \ return 0; \
} }
......
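With the registrar change above, a kernel registered through REGISTER_OP_KERNEL with library type MKLDNN is filed under the MKLDNN layout instead of kAnyLayout. A simplified sketch of the resulting key, with plain strings standing in for the real OpKernelType fields (assumption: only the layout component varies with the library here):

```cpp
#include <cassert>
#include <string>
#include <tuple>

using KernelKey = std::tuple<std::string /*data_type*/, std::string /*place*/,
                             std::string /*layout*/, std::string /*library*/>;

KernelKey MakeKey(const std::string& data_type, const std::string& place,
                  const std::string& library) {
  // Same rule as the registrar functor above: MKLDNN kernels get the MKLDNN
  // layout, everything else keeps ANYLAYOUT.
  std::string layout = (library == "MKLDNN") ? "MKLDNNLAYOUT" : "ANYLAYOUT";
  return std::make_tuple(data_type, place, layout, library);
}

int main() {
  assert(std::get<2>(MakeKey("float", "CPUPlace", "MKLDNN")) == "MKLDNNLAYOUT");
  assert(std::get<2>(MakeKey("float", "CUDAPlace", "CUDNN")) == "ANYLAYOUT");
  return 0;
}
```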
...@@ -293,6 +293,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) { ...@@ -293,6 +293,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) {
} }
} }
bool ExecutionContext::HasInput(const std::string& name) const {
if (!op_.HasInputs(name)) {
return false;
}
auto& ins = Inputs(name);
size_t length = ins.size();
if (length == 0) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL,
"Input %s should not have more than one inputs", name);
auto arg = ins[0];
auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
return var != nullptr;
}
bool ExecutionContext::HasOutput(const std::string& name) const {
if (!op_.HasOutputs(name)) {
return false;
}
auto& outs = Outputs(name);
size_t length = outs.size();
if (length == 0) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL,
"Output %s should not have more than one inputs", name);
auto arg = outs[0];
auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
return var != nullptr;
}
template <> template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const { const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
auto* var = InputVar(name); auto* var = InputVar(name);
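The new ExecutionContext::HasInput/HasOutput above return true only when the slot is declared on the op, carries exactly one argument, and that argument resolves to a live variable in the scope. A local mock of the same decision (illustration only; the kEmptyVarName literal is an assumption about Paddle's empty-variable marker):

```cpp
#include <cassert>
#include <map>
#include <string>
#include <vector>

const std::string kEmptyVarName = "@EMPTY@";  // assumed stand-in for framework::kEmptyVarName

bool HasInput(const std::map<std::string, std::vector<std::string>>& inputs,
              const std::map<std::string, int>& scope,  // name -> dummy variable
              const std::string& name) {
  auto it = inputs.find(name);
  if (it == inputs.end() || it->second.empty()) return false;  // undeclared or empty slot
  assert(it->second.size() == 1u);  // mirrors the PADDLE_ENFORCE_EQ above
  const std::string& arg = it->second[0];
  return arg != kEmptyVarName && scope.count(arg) > 0;
}

int main() {
  std::map<std::string, int> scope = {{"x0", 0}};
  std::map<std::string, std::vector<std::string>> inputs = {
      {"X", {"x0"}}, {"Bias", {}}, {"Mask", {kEmptyVarName}}};
  assert(HasInput(inputs, scope, "X"));       // declared and bound to a real variable
  assert(!HasInput(inputs, scope, "Bias"));   // declared but given no argument
  assert(!HasInput(inputs, scope, "Mask"));   // declared as the empty var name
  assert(!HasInput(inputs, scope, "Scale"));  // not declared at all
  return 0;
}
```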
...@@ -444,9 +476,24 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -444,9 +476,24 @@ class RuntimeInferShapeContext : public InferShapeContext {
auto* out_tensor = out_var->GetMutable<LoDTensor>(); auto* out_tensor = out_var->GetMutable<LoDTensor>();
out_tensor->set_lod(in_tensor.lod()); out_tensor->set_lod(in_tensor.lod());
// TODO(dzhwinter) : reuse ShareLoD in most operators. // TODO(dzhwinter) : reuse ShareLoD in most operators.
// Need to call ShareLayout explicitly in sequence related ops. // Need to call ShareLayout explicitly in sequence related ops.
// Shall we have a better method to share info between in/out Tensor? // Shall we have a better method to share info between in/out Tensor?
#ifdef PADDLE_WITH_MKLDNN
// Fix me: ugly workaround below
// Correct solution:
// set_layout() should NOT be called here (i.e. ShareLoD). Instead,
// layout of output tensor should be set "manually" in Compute()
// of each OPKernel. The reason layout should NOT be shared between
// input and output "automatically" (now by InferShape()->ShareLoD())
// is that layout transform may occur after InferShape().
// Workaround:
// Skip set_layout() when input layout is kMKLDNN
// This is to avoid kMKLDNN being populated wrongly into a non-MKLDNN
// OPKernel. In all MKLDNN OPKernels, set_layout(kMKLDNN) should be called
// in Compute()
if (in_tensor.layout() != DataLayout::kMKLDNN)
#endif
out_tensor->set_layout(in_tensor.layout()); out_tensor->set_layout(in_tensor.layout());
} }
...@@ -646,8 +693,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( ...@@ -646,8 +693,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
} }
if (t != nullptr) { if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type())); int tmp = static_cast<int>(ToDataType(t->type()));
PADDLE_ENFORCE(tmp == data_type || data_type == -1, PADDLE_ENFORCE(
"DataType of Paddle Op %s must be the same.", Type()); tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
data_type, tmp);
data_type = tmp; data_type = tmp;
} }
} }
...@@ -665,7 +714,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType( ...@@ -665,7 +714,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType(
OpKernelType OperatorWithKernel::GetKernelTypeForVar( OpKernelType OperatorWithKernel::GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor, const std::string& var_name, const Tensor& tensor,
const OpKernelType& expected_kernel_type) const { const OpKernelType& expected_kernel_type) const {
return OpKernelType(expected_kernel_type.data_type_, tensor.place()); return OpKernelType(expected_kernel_type.data_type_, tensor.place(),
tensor.layout());
} }
} // namespace framework } // namespace framework
......
...@@ -191,9 +191,9 @@ class ExecutionContext { ...@@ -191,9 +191,9 @@ class ExecutionContext {
return op_.Attr<T>(name); return op_.Attr<T>(name);
} }
bool HasInput(const std::string& name) const { return op_.HasInputs(name); } bool HasInput(const std::string& name) const;
bool HasOutput(const std::string& name) const { return op_.HasOutputs(name); } bool HasOutput(const std::string& name) const;
size_t InputSize(const std::string& name) const { size_t InputSize(const std::string& name) const {
return op_.Inputs(name).size(); return op_.Inputs(name).size();
......
...@@ -111,11 +111,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -111,11 +111,7 @@ ParallelExecutor::ParallelExecutor(
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
#endif #endif
builder_.reset(builder_factory.Create().get()); builder_ = std::move(builder_factory.Create());
if (builder_.get() == nullptr) {
VLOG(3) << "builder is null.";
}
member_->executor_.reset(new details::ThreadedSSAGraphExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places, exec_strategy, member_->local_scopes_, places,
builder_->Build(main_program))); builder_->Build(main_program)));
......
...@@ -34,6 +34,28 @@ namespace framework { ...@@ -34,6 +34,28 @@ namespace framework {
class LoDTensor; class LoDTensor;
class Tensor { class Tensor {
#ifdef PADDLE_WITH_MKLDNN
public:
inline mkldnn::memory::format format() const { return format_; }
inline void set_format(const mkldnn::memory::format format) {
format_ = format;
}
protected:
/**
* @brief the detailed format of the memory block whose layout is kMKLDNN
*
* @note The MKLDNN lib supports various memory formats like nchw, nhwc, nChw8C,
* nChw16c, etc. For an MKLDNN memory block, the layout will be set to
* DataLayout::kMKLDNN while the detailed memory format will be kept in
* this field.
*/
mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
#endif
public: public:
template <typename T, size_t D, int MajorType, typename IndexType> template <typename T, size_t D, int MajorType, typename IndexType>
friend struct EigenTensor; friend struct EigenTensor;
...@@ -195,8 +217,10 @@ class Tensor { ...@@ -195,8 +217,10 @@ class Tensor {
* N,C,H,W for respectively the batch size, the number of * N,C,H,W for respectively the batch size, the number of
* feature maps, the height. * feature maps, the height.
*/ */
// Fix me: here just change the default layout to kNCHW
// it doesn't fix the real issue, i.e. feeder should set up tensor layout
// according to actual input data
DataLayout layout_ = DataLayout::kNHWC; DataLayout layout_ = DataLayout::kNCHW;
/** /**
* @brief A PlaceHolder may be shared by more than one tensor. * @brief A PlaceHolder may be shared by more than one tensor.
......
...@@ -209,7 +209,7 @@ TEST(Tensor, ReshapeToMatrix) { ...@@ -209,7 +209,7 @@ TEST(Tensor, ReshapeToMatrix) {
TEST(Tensor, Layout) { TEST(Tensor, Layout) {
framework::Tensor src; framework::Tensor src;
ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC); ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW);
src.set_layout(framework::DataLayout::kAnyLayout); src.set_layout(framework::DataLayout::kAnyLayout);
ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
} }
# Add TRT tests # Add TRT tests
# This test is not stable
# See https://paddleci.ngrok.io/viewLog.html?tab=buildLog&buildTypeId=Paddle_PrCi2&buildId=36834&_focus=8828
#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
# DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
# SERIAL)
nv_library(tensorrt_converter nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc SRCS mul_op.cc conv2d_op.cc fc_op.cc
DEPS tensorrt_engine mul_op) DEPS tensorrt_engine mul_op)
...@@ -16,3 +11,5 @@ nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc ...@@ -16,3 +11,5 @@ nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
...@@ -22,7 +22,8 @@ namespace tensorrt { ...@@ -22,7 +22,8 @@ namespace tensorrt {
class ReluOpConverter : public OpConverter { class ReluOpConverter : public OpConverter {
public: public:
ReluOpConverter() {} ReluOpConverter() {}
void operator()(const framework::proto::OpDesc& op) override { void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
// Here the two nullptr looks strange, that's because the // Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange. // framework::OpDesc's constructor is strange.
framework::OpDesc op_desc(op, nullptr); framework::OpDesc op_desc(op, nullptr);
...@@ -33,7 +34,12 @@ class ReluOpConverter : public OpConverter { ...@@ -33,7 +34,12 @@ class ReluOpConverter : public OpConverter {
nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor), engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
nvinfer1::ActivationType::kRELU); nvinfer1::ActivationType::kRELU);
engine_->SetITensor(op_desc.Output("Out")[0], layer->getOutput(0)); auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
USE_OP(relu);
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
void Compare(const std::string op_type, float input, float expect) { TEST(ReluOpConverter, main) {
framework::Scope scope; framework::Scope scope;
platform::CUDAPlace place; std::unordered_set<std::string> parameters;
platform::CUDADeviceContext ctx(place); TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6));
// init fluid op and variable validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6));
auto x_var = scope.Var("X");
auto x_tensor = x_var->GetMutable<framework::LoDTensor>(); // Prepare Op description
x_tensor->Resize({1, 1}); framework::OpDesc desc;
x_tensor->mutable_data<float>(place); desc.SetType("relu");
std::vector<float> init; desc.SetInput("X", {"relu-X"});
init.push_back(input); desc.SetOutput("Out", {"relu-Out"});
framework::TensorFromVector(init, ctx, x_tensor);
LOG(INFO) << "set OP";
auto out_var = scope.Var("Out"); validator.SetOp(*desc.Proto());
auto out_tensor = out_var->GetMutable<framework::LoDTensor>(); LOG(INFO) << "execute";
out_tensor->Resize({1, 1});
out_tensor->mutable_data<float>(place); validator.Execute(10);
framework::OpDesc op_desc;
op_desc.SetType(op_type);
op_desc.SetInput("X", {"X"});
op_desc.SetOutput("Out", {"Out"});
auto op = framework::OpRegistry::CreateOp(*op_desc.Proto());
// run fluid op
op->Run(scope, place);
// get fluid output
std::vector<float> out1;
framework::TensorToVector(*out_tensor, ctx, &out1);
// init tensorrt op
cudaStream_t stream;
ASSERT_EQ(0, cudaStreamCreate(&stream));
TensorRTEngine* engine = new TensorRTEngine(1, 1 << 10, &stream);
engine->InitNetwork();
engine->DeclareInput("X", nvinfer1::DataType::kFLOAT,
nvinfer1::DimsCHW{1, 1, 1});
// convert op
OpConverter op_converter;
op_converter.ConvertOp(*op_desc.Proto(), engine);
engine->DeclareOutput("Out");
engine->FreezeNetwork();
// convert LoDTensor to ITensor
size_t size = x_tensor->memory_size();
EngineIOConverter::ConvertInput(op_type, *x_tensor,
engine->buffer("X").buffer, size, &stream);
// run tensorrt Outp
engine->Execute(1);
// convert ITensor to LoDTensor
EngineIOConverter::ConvertOutput(op_type, engine->buffer("Out").buffer,
out_tensor, size, &stream);
// get tensorrt output
std::vector<float> out2;
framework::TensorToVector(*out_tensor, ctx, &out2);
// compare
ASSERT_EQ(out1[0], out2[0]);
ASSERT_EQ(out1[0], expect);
delete engine;
cudaStreamDestroy(stream);
}
TEST(OpConverter, ConvertRelu) {
Compare("relu", 1, 1); // relu(1) = 1
Compare("relu", -5, 0); // relu(-5) = 0
} }
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(activation); USE_OP(relu);
...@@ -166,8 +166,6 @@ function(op_library TARGET) ...@@ -166,8 +166,6 @@ function(op_library TARGET)
# NOTE(*): activation uses a macro to register the kernels, set use_op manually. # NOTE(*): activation uses a macro to register the kernels, set use_op manually.
if(${TARGET} STREQUAL "activation") if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP(relu);\n") file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "reduce")
file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
elseif(${TARGET} STREQUAL "fake_dequantize") elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
else() else()
......
...@@ -58,14 +58,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, ...@@ -58,14 +58,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel& oper, const framework::OperatorWithKernel& oper,
const std::string& name) { const std::string& name) {
framework::LibraryType library{framework::LibraryType::kPlain}; framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
auto it = oper.Attrs().find("use_mkldnn"); auto it = oper.Attrs().find("use_mkldnn");
if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN; library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
} }
#endif #endif
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()), framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()),
ctx.GetPlace(), layout, library); ctx.GetPlace(), layout, library);
......
...@@ -111,14 +111,16 @@ class BatchNormOp : public framework::OperatorWithKernel { ...@@ -111,14 +111,16 @@ class BatchNormOp : public framework::OperatorWithKernel {
"Variance input should be of float type"); "Variance input should be of float type");
framework::LibraryType library_{framework::LibraryType::kPlain}; framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain && if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN; library_ = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
} }
#endif #endif
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library_); library_);
} }
...@@ -367,17 +369,18 @@ class BatchNormGradOp : public framework::OperatorWithKernel { ...@@ -367,17 +369,18 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
} }
framework::LibraryType library_{framework::LibraryType::kPlain}; framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain && if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN; library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
} }
#endif #endif
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(), framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout, library_); layout_, library_);
} }
}; };
......
...@@ -75,6 +75,11 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -75,6 +75,11 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOp::GetExpectedKernelType( framework::OpKernelType ConvOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
framework::LibraryType library{framework::LibraryType::kPlain}; framework::LibraryType library{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) { if (platform::CanCUDNNBeUsed(ctx)) {
library = framework::LibraryType::kCUDNN; library = framework::LibraryType::kCUDNN;
...@@ -84,6 +89,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -84,6 +89,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
if (library == framework::LibraryType::kPlain && if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN; library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
} }
#endif #endif
...@@ -99,9 +105,6 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -99,9 +105,6 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
"float16 can only be used when CUDNN is used"); "float16 can only be used when CUDNN is used");
} }
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout = framework::StringToDataLayout(data_format);
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library); library);
} }
...@@ -309,6 +312,10 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { ...@@ -309,6 +312,10 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOpGrad::GetExpectedKernelType( framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain}; framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) { if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN; library_ = framework::LibraryType::kCUDNN;
...@@ -318,12 +325,10 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( ...@@ -318,12 +325,10 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
if (library_ == framework::LibraryType::kPlain && if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN; library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
} }
#endif #endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(), framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
layout_, library_); layout_, library_);
......
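The conv changes above (and the batch_norm, lrn, and pool edits around them) all follow one pattern: the plain layout comes from the data_format attribute, CUDNN keeps it, and selecting the MKLDNN library now also switches the expected layout to MKLDNN. A local sketch of that pairing (not Paddle code; strings are used instead of the real enums):

```cpp
#include <cassert>
#include <string>
#include <utility>

std::pair<std::string, std::string> PickKernel(bool can_use_cudnn,
                                               bool can_use_mkldnn,
                                               const std::string& data_format) {
  std::string library = "PLAIN";
  std::string layout = data_format;  // e.g. "NCHW" from the op attribute
  if (can_use_cudnn) {
    library = "CUDNN";
  } else if (can_use_mkldnn) {
    library = "MKLDNN";
    layout = "MKLDNN";  // the new line added in each GetExpectedKernelType
  }
  return {library, layout};
}

int main() {
  assert(PickKernel(false, true, "NCHW") ==
         std::make_pair(std::string("MKLDNN"), std::string("MKLDNN")));
  assert(PickKernel(true, true, "NCHW").second == "NCHW");    // CUDNN keeps data_format
  assert(PickKernel(false, false, "NHWC").second == "NHWC");  // plain keeps data_format
  return 0;
}
```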
...@@ -43,7 +43,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -43,7 +43,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType FCOp::GetExpectedKernelType( framework::OpKernelType FCOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
framework::LibraryType library{framework::LibraryType::kMKLDNN}; framework::LibraryType library{framework::LibraryType::kMKLDNN};
framework::DataLayout layout{framework::DataLayout::kAnyLayout}; framework::DataLayout layout{framework::DataLayout::kMKLDNN};
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(), framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
...@@ -65,7 +65,7 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { ...@@ -65,7 +65,7 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType FCOpGrad::GetExpectedKernelType( framework::OpKernelType FCOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
framework::LibraryType library{framework::LibraryType::kMKLDNN}; framework::LibraryType library{framework::LibraryType::kMKLDNN};
framework::DataLayout layout{framework::DataLayout::kAnyLayout}; framework::DataLayout layout{framework::DataLayout::kMKLDNN};
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(), framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
......
...@@ -124,16 +124,17 @@ namespace { ...@@ -124,16 +124,17 @@ namespace {
framework::OpKernelType GetExpectedLRNKernel( framework::OpKernelType GetExpectedLRNKernel(
const framework::ExecutionContext& ctx) { const framework::ExecutionContext& ctx) {
framework::LibraryType library_{framework::LibraryType::kPlain}; framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain && if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN; library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
} }
#endif #endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(), framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout_, library_); layout_, library_);
......
...@@ -24,10 +24,13 @@ using mkldnn::pooling_backward; ...@@ -24,10 +24,13 @@ using mkldnn::pooling_backward;
// Generate keys for storing/retrieving primitives for this operator // Generate keys for storing/retrieving primitives for this operator
// TODO(jczaja): Make hashing function more optimal // TODO(jczaja): Make hashing function more optimal
static std::string gethash(memory::dims& input_dims, std::string& pooling_type, static std::string gethash(const memory::dims& input_dims,
std::vector<int>& ksize, std::vector<int>& strides, const std::string& pooling_type,
std::vector<int>& paddings, std::string suffix) { const std::vector<int>& ksize,
auto dims2str = [](memory::dims& operand_dims) { const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& suffix) {
auto dims2str = [](const memory::dims& operand_dims) {
std::string dstr = ""; std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) { for (size_t i = 0; i < operand_dims.size(); ++i) {
dstr += std::to_string(operand_dims[i]) + "-"; dstr += std::to_string(operand_dims[i]) + "-";
......
...@@ -83,6 +83,9 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { ...@@ -83,6 +83,9 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
framework::OpKernelType PoolOp::GetExpectedKernelType( framework::OpKernelType PoolOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const { const framework::ExecutionContext &ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain}; framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) { if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN; library_ = framework::LibraryType::kCUDNN;
...@@ -92,11 +95,10 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( ...@@ -92,11 +95,10 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
if (library_ == framework::LibraryType::kPlain && if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN; library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
} }
#endif #endif
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(), framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout_, library_); layout_, library_);
...@@ -112,6 +114,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { ...@@ -112,6 +114,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
framework::OpKernelType PoolOpGrad::GetExpectedKernelType( framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const { const framework::ExecutionContext &ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain}; framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) { if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN; library_ = framework::LibraryType::kCUDNN;
...@@ -121,6 +126,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( ...@@ -121,6 +126,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
if (library_ == framework::LibraryType::kPlain && if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN; library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
} }
#endif #endif
...@@ -129,8 +135,6 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( ...@@ -129,8 +135,6 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used"); "float16 can only be used when CUDNN is used");
} }
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
library_); library_);
} }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
REGISTER_REDUCE_OP(reduce_max);
REGISTER_OP_CPU_KERNEL(
reduce_max, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::MaxFunctor>);
REGISTER_OP_CPU_KERNEL(
reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_max,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MaxFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_mean_op.h"
REGISTER_REDUCE_OP(reduce_mean);
REGISTER_OP_CPU_KERNEL(reduce_mean,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
float, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
double, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
int, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::MeanFunctor>);
REGISTER_OP_CPU_KERNEL(reduce_mean_grad,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
double, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::MeanGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_mean_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_mean,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MeanFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MeanFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MeanGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
struct MeanFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->mean(dim);
}
};
struct MeanGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim) / dx->constant(size);
}
};
} // namespace operators
} // namespace paddle
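MeanGradFunctor above distributes the upstream gradient evenly: since d(mean(x))/dx_i = 1/n, every element that contributed to the mean receives dy / n. A standalone numeric check of that rule (plain loops instead of Eigen expressions, for illustration only):

```cpp
#include <cassert>
#include <vector>

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};  // reduced to mean = 2.5
  float dy = 8.f;                               // upstream gradient of the mean
  std::vector<float> dx(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    dx[i] = dy / static_cast<float>(x.size());  // dy->broadcast(dim) / size
  }
  for (float g : dx) assert(g == 2.f);          // 8 / 4
  return 0;
}
```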
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
struct MaxFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->maximum(dim);
}
};
struct MinFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->minimum(dim);
}
};
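// Shared gradient for max and min. Example: x = {3, 1, 3} reduced with max over
// dim 0 gives y = 3; with upstream gradient dy = 1, equals = {1, 0, 1} and
// dx = {1, 0, 1} -- every element equal to the extremum receives the gradient.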
struct MaxOrMinGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
auto equals = (*x) == y->broadcast(dim);
auto ones = dx->constant(1);
auto zeros = dx->constant(0);
// If there are multiple minimum or maximum elements, the subgradient of
// each is the set [0, 1], and we pass gradient to all of them here.
dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
REGISTER_REDUCE_OP(reduce_min);
REGISTER_OP_CPU_KERNEL(
reduce_min, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::MinFunctor>);
REGISTER_OP_CPU_KERNEL(
reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_min,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MinFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_op.h"
#include <algorithm>
#include <string>
#include <vector>
namespace paddle {
namespace operators {
using framework::Tensor;
class ReduceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ReduceOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ReduceOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
PADDLE_ENFORCE_LT(
dims[i], x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
}
sort(dims.begin(), dims.end());
bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
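    // Example of the shape rules implemented below: for X with shape [2, 3, 4]
    // and dim = {1}, Out is [2, 4] when keep_dim is false and [2, 1, 4] when
    // keep_dim is true; with reduce_all the result collapses to a single element.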
if (reduce_all) {
if (keep_dim)
ctx->SetOutputDim(
"Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
else
ctx->SetOutputDim("Out", {1});
} else {
auto dims_vector = vectorize(x_dims);
if (keep_dim) {
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = 1;
}
} else {
const int kDelFlag = -2;
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = kDelFlag;
}
dims_vector.erase(
remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
}
auto out_dims = framework::make_ddim(dims_vector);
ctx->SetOutputDim("Out", out_dims);
if (dims[0] != 0) {
// Only pass LoD when not reducing on the first dim.
ctx->ShareLoD("X", /*->*/ "Out");
}
}
}
};
class ReduceGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
PADDLE_ENFORCE_LT(
dims[i], x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
}
sort(dims.begin(), dims.end());
auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
ctx->ShareLoD("X", /*->*/ x_grad_name);
}
}
};
class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() final {
AddInput("X",
"(Tensor) The input tensor. Tensors with rank at most 6 are "
"supported.");
AddOutput("Out", "(Tensor) The result tensor.");
AddAttr<std::vector<int>>(
"dim",
"(list<int>, default {0}) The dimensions to reduce. "
"Must be in the range [-rank(input), rank(input)). "
"If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
"Note that reducing on the first dim will make the LoD info lost.")
.SetDefault({0});
AddAttr<bool>("keep_dim",
"(bool, default false) "
"If true, retain the reduced dimension with length 1.")
.SetDefault(false);
AddAttr<bool>("reduce_all",
"(bool, default false) "
"If true, output a scalar reduced along all dimensions.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC(
%s Operator.
This operator computes the %s of input tensor along the given dimension.
The result tensor has 1 fewer dimension than the input unless keep_dim is true.
If reduce_all is true, just reduce along all dimensions and output a scalar.
)DOC",
GetOpType(), GetName()));
}
protected:
virtual std::string GetName() const = 0;
virtual std::string GetOpType() const = 0;
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_REDUCE_OP(op_name) \
class __##op_name##Maker__ : public ops::ReduceOpMaker { \
protected: \
virtual std::string GetName() const { return #op_name; } \
virtual std::string GetOpType() const { return "Reduce " #op_name; } \
}; \
REGISTER_OPERATOR(reduce_##op_name, ops::ReduceOp, __##op_name##Maker__, \
paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(reduce_##op_name##_grad, ops::ReduceGradOp)
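// For instance, REGISTER_REDUCE_OP(sum) generates __sumMaker__ and registers the
// reduce_sum and reduce_sum_grad operators; the kernels are registered separately
// below.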
REGISTER_REDUCE_OP(sum);
REGISTER_REDUCE_OP(mean);
REGISTER_REDUCE_OP(max);
REGISTER_REDUCE_OP(min);
REGISTER_REDUCE_OP(prod);
#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL(reduce_type, \
ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
float, ops::functor>, \
ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
double, ops::functor>, \
ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
int, ops::functor>, \
ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
int64_t, ops::functor>); \
REGISTER_OP_CPU_KERNEL( \
reduce_type##_grad, \
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t, \
ops::grad_functor>);
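// FOR_EACH_KERNEL_FUNCTOR expands the macro above once per (op, functor,
// grad_functor) triple -- e.g. (reduce_sum, SumFunctor, SumGradFunctor) -- so the
// single line below registers CPU kernels for all five reduce ops.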
FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/reduce_op.h"
namespace ops = paddle::operators;
#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \
reduce_type, ops::ReduceKernel<paddle::platform::CUDADeviceContext, \
float, ops::functor>, \
ops::ReduceKernel<paddle::platform::CUDADeviceContext, double, \
ops::functor>, \
ops::ReduceKernel<paddle::platform::CUDADeviceContext, int, \
ops::functor>, \
ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t, \
ops::functor>); \
REGISTER_OP_CUDA_KERNEL( \
reduce_type##_grad, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, \
ops::grad_functor>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL);
...@@ -14,105 +14,20 @@ limitations under the License. */ ...@@ -14,105 +14,20 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <string>
#include <vector> #include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/reduce_op_function.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
struct SumFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->sum(dim);
}
};
struct SumGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim);
}
};
struct MeanFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->mean(dim);
}
};
struct MeanGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim) / dx->constant(size);
}
};
struct MaxFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->maximum(dim);
}
};
struct MinFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->minimum(dim);
}
};
struct MaxOrMinGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
auto equals = (*x) == y->broadcast(dim);
auto ones = dx->constant(1);
auto zeros = dx->constant(0);
// If there are multiple minimum or maximum elements, the subgradient of
// each is the set [0, 1], and we pass gradient to all of them here.
dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
}
};
struct ProdFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->prod(dim);
}
};
struct ProdGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
}
};
#define HANDLE_DIM(NDIM, RDIM) \ #define HANDLE_DIM(NDIM, RDIM) \
if (ndim == NDIM && rdim == RDIM) { \ if (ndim == NDIM && rdim == RDIM) { \
ReduceCompute<NDIM, RDIM>(context); \ ReduceFunctor<DeviceContext, T, NDIM, RDIM, Functor>( \
context.template device_context<DeviceContext>(), *input, output, \
dims, keep_dim); \
} }
template <typename DeviceContext, typename T, typename Functor> template <typename DeviceContext, typename T, typename Functor>
...@@ -120,11 +35,15 @@ class ReduceKernel : public framework::OpKernel<T> { ...@@ -120,11 +35,15 @@ class ReduceKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all"); bool reduce_all = context.Attr<bool>("reduce_all");
if (reduce_all) {
// Flatten and reduce 1-D tensor
auto* input = context.Input<Tensor>("X"); auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out"); auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace()); output->mutable_data<T>(context.GetPlace());
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
if (reduce_all) {
// Flatten and reduce 1-D tensor
auto x = EigenVector<T>::Flatten(*input); auto x = EigenVector<T>::Flatten(*input);
auto out = EigenScalar<T>::From(*output); auto out = EigenScalar<T>::From(*output);
auto& place = auto& place =
...@@ -133,8 +52,8 @@ class ReduceKernel : public framework::OpKernel<T> { ...@@ -133,8 +52,8 @@ class ReduceKernel : public framework::OpKernel<T> {
Functor functor; Functor functor;
functor(place, &x, &out, reduce_dim); functor(place, &x, &out, reduce_dim);
} else { } else {
int ndim = context.Input<Tensor>("X")->dims().size(); int ndim = input->dims().size();
int rdim = context.Attr<std::vector<int>>("dim").size(); int rdim = dims.size();
// comments for accelerating compiling temporarily. // comments for accelerating compiling temporarily.
// HANDLE_DIM(6, 5); // HANDLE_DIM(6, 5);
// HANDLE_DIM(6, 4); // HANDLE_DIM(6, 4);
...@@ -154,48 +73,6 @@ class ReduceKernel : public framework::OpKernel<T> { ...@@ -154,48 +73,6 @@ class ReduceKernel : public framework::OpKernel<T> {
HANDLE_DIM(1, 1); HANDLE_DIM(1, 1);
} }
} }
private:
template <size_t D, size_t R_D>
void ReduceCompute(const framework::ExecutionContext& context) const {
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());
auto x = EigenTensor<T, D>::From(*input);
auto x_rank = static_cast<int>(x.dimensions().size());
auto dims = context.Attr<std::vector<int>>("dim");
auto reduce_dim = Eigen::array<int, R_D>();
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
reduce_dim[i] = dims[i];
}
// construct the squeezed output tensor
bool keep_dim = context.Attr<bool>("keep_dim");
DDim out_dims = output->dims();
if (keep_dim && x_rank > 1) {
const int kDelFlag = -2;
auto dims_vector = vectorize(out_dims);
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = kDelFlag;
}
dims_vector.erase(
remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
out_dims = framework::make_ddim(dims_vector);
}
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
Functor functor;
if (D == 1) {
auto out = EigenScalar<T>::From(*output);
functor(place, &x, &out, reduce_dim);
} else {
auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
functor(place, &x, &out, reduce_dim);
}
}
}; };
template <typename DeviceContext, typename T, typename Functor> template <typename DeviceContext, typename T, typename Functor>
...@@ -203,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel<T> { ...@@ -203,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all"); bool reduce_all = context.Attr<bool>("reduce_all");
if (reduce_all) { auto dims = context.Attr<std::vector<int>>("dim");
auto* input0 = context.Input<Tensor>("X"); auto* input0 = context.Input<Tensor>("X");
auto* input1 = context.Input<Tensor>("Out"); auto* input1 = context.Input<Tensor>("Out");
auto* input2 = context.Input<Tensor>(framework::GradVarName("Out")); auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
auto* output = context.Output<Tensor>(framework::GradVarName("X")); auto* output = context.Output<Tensor>(framework::GradVarName("X"));
output->mutable_data<T>(context.GetPlace()); output->mutable_data<T>(context.GetPlace());
if (reduce_all) {
auto x = EigenVector<T>::Flatten(*input0); auto x = EigenVector<T>::Flatten(*input0);
auto x_reduce = EigenVector<T>::From(*input1); auto x_reduce = EigenVector<T>::From(*input1);
auto x_reduce_grad = EigenVector<T>::From(*input2); auto x_reduce_grad = EigenVector<T>::From(*input2);
...@@ -221,74 +101,172 @@ class ReduceGradKernel : public framework::OpKernel<T> { ...@@ -221,74 +101,172 @@ class ReduceGradKernel : public framework::OpKernel<T> {
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
broadcast_dim[0]); broadcast_dim[0]);
} else { } else {
int rank = context.Input<Tensor>("X")->dims().size(); int rank = input0->dims().size();
switch (rank) { switch (rank) {
case 1: case 1:
ReduceGradCompute<1>(context); ReduceGradFunctor<DeviceContext, T, 1, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break; break;
case 2: case 2:
ReduceGradCompute<2>(context); ReduceGradFunctor<DeviceContext, T, 2, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break; break;
case 3: case 3:
ReduceGradCompute<3>(context); ReduceGradFunctor<DeviceContext, T, 3, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break; break;
case 4: case 4:
ReduceGradCompute<4>(context); ReduceGradFunctor<DeviceContext, T, 4, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break; break;
case 5: case 5:
ReduceGradCompute<5>(context); ReduceGradFunctor<DeviceContext, T, 5, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break; break;
case 6: case 6:
ReduceGradCompute<6>(context); ReduceGradFunctor<DeviceContext, T, 6, Functor>(
context.template device_context<DeviceContext>(), *input0,
*input1, *input2, output, dims);
break; break;
} }
} }
} }
};
private: class ReduceOp : public framework::OperatorWithKernel {
template <size_t D> public:
void ReduceGradCompute(const framework::ExecutionContext& context) const { using framework::OperatorWithKernel::OperatorWithKernel;
auto* input0 = context.Input<Tensor>("X");
auto* input1 = context.Input<Tensor>("Out");
auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
auto* output = context.Output<Tensor>(framework::GradVarName("X"));
output->mutable_data<T>(context.GetPlace()); void InferShape(framework::InferShapeContext* ctx) const override {
auto x = EigenTensor<T, D>::From(*input0); PADDLE_ENFORCE(ctx->HasInput("X"),
auto x_grad = EigenTensor<T, D>::From(*output); "Input(X) of ReduceOp should not be null.");
auto x_rank = static_cast<int>(x.dimensions().size()); PADDLE_ENFORCE(ctx->HasOutput("Out"),
auto dims = context.Attr<std::vector<int>>("dim"); "Output(Out) of ReduceOp should not be null.");
auto x_dims = input0->dims(); auto x_dims = ctx->GetInputDim("X");
auto reduced_dims_v = vectorize(x_dims); auto x_rank = x_dims.size();
Eigen::array<int, D> broadcast_dim; PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
PADDLE_ENFORCE_LT(
dims[i], x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
}
sort(dims.begin(), dims.end());
bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
if (reduce_all) {
if (keep_dim)
ctx->SetOutputDim(
"Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
else
ctx->SetOutputDim("Out", {1});
} else {
auto dims_vector = vectorize(x_dims);
if (keep_dim) {
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = 1;
}
} else {
const int kDelFlag = -2;
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = kDelFlag;
}
dims_vector.erase(
remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
}
auto out_dims = framework::make_ddim(dims_vector);
ctx->SetOutputDim("Out", out_dims);
if (dims[0] != 0) {
// Only pass LoD when not reducing on the first dim.
ctx->ShareLoD("X", /*->*/ "Out");
}
}
}
};
int broad_cats_times = 1; class ReduceGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) { for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i]; if (dims[i] < 0) dims[i] = x_rank + dims[i];
reduced_dims_v[dims[i]] = 1; PADDLE_ENFORCE_LT(
broadcast_dim[dims[i]] = x_dims[dims[i]]; dims[i], x_rank,
broad_cats_times *= x_dims[dims[i]]; "The dim should be in the range [-rank(input), rank(input)).");
}
sort(dims.begin(), dims.end());
auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
ctx->ShareLoD("X", /*->*/ x_grad_name);
}
} }
auto reduced_dims = framework::make_ddim(reduced_dims_v); };
auto x_reduce = EigenTensor<T, D>::From(*input1, reduced_dims);
auto x_reduce_grad = EigenTensor<T, D>::From(*input2, reduced_dims);
auto& place = class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
*context.template device_context<DeviceContext>().eigen_device(); public:
void Make() final {
AddInput("X",
"(Tensor) The input tensor. Tensors with rank at most 6 are "
"supported.");
AddOutput("Out", "(Tensor) The result tensor.");
AddAttr<std::vector<int>>(
"dim",
"(list<int>, default {0}) The dimensions to reduce. "
"Must be in the range [-rank(input), rank(input)). "
"If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
"Note that reducing on the first dim will make the LoD info lost.")
.SetDefault({0});
AddAttr<bool>("keep_dim",
"(bool, default false) "
"If true, retain the reduced dimension with length 1.")
.SetDefault(false);
AddAttr<bool>("reduce_all",
"(bool, default false) "
"If true, output a scalar reduced along all dimensions.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC(
%s Operator.
Functor functor; This operator computes the %s of input tensor along the given dimension.
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, The result tensor has 1 fewer dimension than the input unless keep_dim is true.
broad_cats_times); If reduce_all is true, just reduce along all dimensions and output a scalar.
)DOC",
GetOpType(), GetName()));
} }
protected:
virtual std::string GetName() const = 0;
virtual std::string GetOpType() const = 0;
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ namespace ops = paddle::operators;
__macro(reduce_sum, SumFunctor, SumGradFunctor); \
__macro(reduce_mean, MeanFunctor, MeanGradFunctor); \ #define REGISTER_REDUCE_OP(op_name) \
__macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \ class __##op_name##Maker__ : public ops::ReduceOpMaker { \
__macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \ protected: \
__macro(reduce_prod, ProdFunctor, ProdGradFunctor); virtual std::string GetName() const { return #op_name; } \
virtual std::string GetOpType() const { return "Reduce " #op_name; } \
}; \
REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \
paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext, typename T, size_t D, size_t R_D,
typename Functor>
void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input,
framework::Tensor* output, const std::vector<int>& dims,
bool keep_dim) {
auto x = EigenTensor<T, D>::From(input);
auto x_rank = static_cast<int>(x.dimensions().size());
auto reduce_dim = Eigen::array<int, R_D>();
std::vector<int> dims_ref = dims;
for (size_t i = 0; i < dims_ref.size(); ++i) {
if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
reduce_dim[i] = dims_ref[i];
}
// construct the squeezed output tensor
DDim out_dims = output->dims();
if (keep_dim && x_rank > 1) {
const int kDelFlag = -2;
auto dims_vector = framework::vectorize(out_dims);
for (size_t i = 0; i < dims_ref.size(); ++i) {
dims_vector[dims_ref[i]] = kDelFlag;
}
dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
out_dims = framework::make_ddim(dims_vector);
}
auto& place = *context.eigen_device();
Functor functor;
if (D == 1) {
auto out = EigenScalar<T>::From(*output);
functor(place, &x, &out, reduce_dim);
} else {
auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
functor(place, &x, &out, reduce_dim);
}
}
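// ReduceGradFunctor below views Out and dOut with the reduced dims restored to
// size 1 and broadcasts them back to the input shape. E.g. for input0 of shape
// [2, 3] and dims = {1}, reduced_dims becomes [2, 1], broadcast_dim is {1, 3},
// and broad_cats_times (the number of reduced elements) is 3.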
template <typename DeviceContext, typename T, size_t D, typename Functor>
void ReduceGradFunctor(const DeviceContext& context,
const framework::Tensor& input0,
const framework::Tensor& input1,
const framework::Tensor& input2,
framework::Tensor* output,
const std::vector<int>& dims) {
auto x = EigenTensor<T, D>::From(input0);
auto x_grad = EigenTensor<T, D>::From(*output);
auto x_rank = static_cast<int>(x.dimensions().size());
auto x_dims = input0.dims();
auto reduced_dims_v = framework::vectorize(x_dims);
std::vector<int> dims_ref = dims;
Eigen::array<int, D> broadcast_dim;
for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
int broad_cats_times = 1;
for (size_t i = 0; i < dims_ref.size(); ++i) {
if (dims_ref[i] < 0) {
dims_ref[i] = x_rank + dims_ref[i];
}
reduced_dims_v[dims_ref[i]] = 1;
broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]];
broad_cats_times *= x_dims[dims_ref[i]];
}
auto reduced_dims = framework::make_ddim(reduced_dims_v);
auto x_reduce = EigenTensor<T, D>::From(input1, reduced_dims);
auto x_reduce_grad = EigenTensor<T, D>::From(input2, reduced_dims);
auto& place = *context.eigen_device();
Functor functor;
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
broad_cats_times);
}
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_prod_op.h"
REGISTER_REDUCE_OP(reduce_prod);
REGISTER_OP_CPU_KERNEL(reduce_prod,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
float, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
double, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
int, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::ProdFunctor>);
REGISTER_OP_CPU_KERNEL(reduce_prod_grad,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
double, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::ProdGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_prod_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_prod,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::ProdFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::ProdGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
struct ProdFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->prod(dim);
}
};
struct ProdGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_sum_op.h"
REGISTER_REDUCE_OP(reduce_sum);
REGISTER_OP_CPU_KERNEL(
reduce_sum, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::SumFunctor>);
REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
double, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::SumGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_sum_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_sum,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::SumFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::SumGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
struct SumFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->sum(dim);
}
};
struct SumGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim);
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reverse_op.h"
#include <vector>
namespace paddle {
namespace operators {
class ReverseOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
const auto& x_dims = ctx->GetInputDim("X");
const auto& axis = ctx->Attrs().Get<std::vector<int>>("axis");
PADDLE_ENFORCE(!axis.empty(), "'axis' can not be empty.");
for (int a : axis) {
PADDLE_ENFORCE_LT(a, x_dims.size(),
"The axis must be less than input tensor's rank.");
}
ctx->SetOutputDim("Out", x_dims);
}
};
class ReverseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The LoDTensor to be flipped.");
AddOutput("Out", "The LoDTensor after flipping.");
AddAttr<std::vector<int>>(
"axis", "The axises that along which order of elements is reversed.");
AddComment(R"DOC(
Reverse Operator.
Reverse the order of elements in the input LoDTensor along the given axes.
Case 1:
Given
X = [[1, 2, 3, 4, 5]
[6, 7, 8, 9, 10]
[11, 12, 13, 14, 15]],
and
axis = [0],
we get:
Out = [[11, 12, 13, 14, 15]
[6, 7, 8, 9, 10]
[1, 2, 3, 4, 5]].
Case 2:
Given
X = [[[1, 2, 3, 4]
[5, 6, 7, 8]]
[[9, 10, 11, 12]
[13, 14, 15, 16]]],
and
axis = [0, 2],
we get:
Out = [[[12, 11, 10, 9]
[16, 15, 14, 13]]
[[4, 3, 2, 1]
[8, 7, 6, 5]]],
)DOC");
}
};
class ReverseGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* grad_op = new framework::OpDesc();
grad_op->SetType("reverse");
grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttr("axis", GetAttr("axis"));
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(reverse, ops::ReverseOp, ops::ReverseOpMaker,
ops::ReverseGradMaker);
REGISTER_OPERATOR(reverse_grad, ops::ReverseOp);
REGISTER_OP_CPU_KERNEL(
reverse, ops::ReverseKernel<paddle::platform::CPUDeviceContext, int>,
ops::ReverseKernel<paddle::platform::CPUDeviceContext, uint8_t>,
ops::ReverseKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::ReverseKernel<paddle::platform::CPUDeviceContext, bool>,
ops::ReverseKernel<paddle::platform::CPUDeviceContext, float>,
ops::ReverseKernel<paddle::platform::CPUDeviceContext, double>)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reverse_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
reverse, ops::ReverseKernel<paddle::platform::CUDADeviceContext, int>,
ops::ReverseKernel<paddle::platform::CUDADeviceContext, uint8_t>,
ops::ReverseKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ReverseKernel<paddle::platform::CUDADeviceContext, bool>,
ops::ReverseKernel<paddle::platform::CUDADeviceContext, float>,
ops::ReverseKernel<paddle::platform::CUDADeviceContext, double>)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T, int Rank>
struct ReverseFunctor {
void operator()(const DeviceContext& context, const framework::LoDTensor& in,
framework::LoDTensor* out, const std::vector<int>& axis) {
Eigen::array<bool, Rank> reverse_axis;
for (int i = 0; i < Rank; ++i) {
reverse_axis[i] = false;
}
for (int a : axis) {
reverse_axis[a] = true;
}
auto in_eigen = framework::EigenTensor<T, Rank>::From(in);
auto out_eigen = framework::EigenTensor<T, Rank>::From(*out);
auto* dev = context.eigen_device();
out_eigen.device(*dev) = in_eigen.reverse(reverse_axis);
}
};
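// For example, axis = {0, 2} on a rank-3 input sets reverse_axis to
// {true, false, true}, which flips the tensor along the first and last
// dimensions (Case 2 in the ReverseOpMaker comment).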
template <typename DeviceContext, typename T>
class ReverseKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<framework::LoDTensor>("X");
auto* out = context.Output<framework::LoDTensor>("Out");
out->mutable_data<T>(context.GetPlace());
const auto& axis = context.Attr<std::vector<int>>("axis");
int rank = x->dims().size();
auto& dev_ctx = context.template device_context<DeviceContext>();
switch (rank) {
case 1:
ReverseFunctor<DeviceContext, T, 1> functor1;
functor1(dev_ctx, *x, out, axis);
break;
case 2:
ReverseFunctor<DeviceContext, T, 2> functor2;
functor2(dev_ctx, *x, out, axis);
break;
case 3:
ReverseFunctor<DeviceContext, T, 3> functor3;
functor3(dev_ctx, *x, out, axis);
break;
case 4:
ReverseFunctor<DeviceContext, T, 4> functor4;
functor4(dev_ctx, *x, out, axis);
break;
case 5:
ReverseFunctor<DeviceContext, T, 5> functor5;
functor5(dev_ctx, *x, out, axis);
break;
case 6:
ReverseFunctor<DeviceContext, T, 6> functor6;
functor6(dev_ctx, *x, out, axis);
break;
default:
PADDLE_THROW(
"Reserve operator doesn't supports tensors whose ranks are greater "
"than 6.");
}
}
};
} // namespace operators
} // namespace paddle
@@ -49,6 +49,9 @@ class SoftmaxOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
     framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
 #ifdef PADDLE_WITH_CUDA
     if (platform::CanCUDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kCUDNN;
@@ -58,6 +61,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
     if (library_ == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
@@ -68,9 +72,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
                    "float16 can only be used on GPU place");
     }
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-                                   framework::StringToDataLayout(data_format),
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
                                    library_);
   }
 };
@@ -142,6 +144,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
     framework::LibraryType library_{framework::LibraryType::kPlain};
 #ifdef PADDLE_WITH_CUDA
     if (platform::CanCUDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kCUDNN;
......
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <mkldnn.h>
 #include <vector>
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace platform {
@@ -86,5 +87,17 @@ inline mkldnn::memory::data_type MKLDNNGetDataType<float>() {
   return mkldnn::memory::f32;
 }
 
+inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) {
+  auto reorder_prim = mkldnn::reorder(src, dst);
+  std::vector<mkldnn::primitive> pipeline;
+  pipeline.push_back(reorder_prim);
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+}
+
+inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) {
+  return static_cast<mkldnn::memory::format>(
+      memory.get_primitive_desc().desc().data.format);
+}
+
 }  // namespace platform
 }  // namespace paddle
@@ -447,7 +447,7 @@ EOF
     # run paddle version to install python packages first
     RUN apt-get update &&\
         ${NCCL_DEPS}\
-        apt-get install -y wget python-pip python-opencv dmidecode python-tk && easy_install -U pip && \
+        apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
         pip install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
         rm -f /*.whl && \
......
@@ -170,6 +170,8 @@ def get_program_cache_key(feed, fetch_list):
             return var.desc.name()
         elif isinstance(var, str):
             return var
+        elif isinstance(var, basestring):
+            return str(var)
         else:
             raise TypeError(str(var) + " should be Variable or str")
......
@@ -72,6 +72,8 @@ def convert_np_dtype_to_dtype_(np_dtype):
         return core.VarDesc.VarType.INT64
     elif dtype == np.bool:
         return core.VarDesc.VarType.BOOL
+    elif dtype == np.uint16:
+        return core.VarDesc.VarType.INT16
     elif dtype == np.uint8:
         return core.VarDesc.VarType.UINT8
     else:
@@ -368,6 +370,13 @@ class Operator(object):
     Block. Users can use the build in instructions to describe their neural
     network.
     """
+    OP_WITHOUT_KERNEL_SET = {
+        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
+        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
+        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
+        'ncclInit', 'channel_create', 'channel_close', 'channel_send',
+        'channel_recv', 'select'
+    }
 
     def __init__(self,
                  block,
@@ -504,17 +513,13 @@ class Operator(object):
             else:
                 self.desc.set_attr(attr_name, self.attrs[attr_name])
         self.desc.check_attrs()
-        no_kernel_op_set = {
-            'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
-            'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
-            'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
-            'load_combine', 'ncclInit', 'channel_create', 'channel_close',
-            'channel_send', 'channel_recv', 'select', 'gen_nccl_id'
-        }
-        if type not in no_kernel_op_set:
+        if self.has_kernel(type):
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
 
+    def has_kernel(self, op_type):
+        return op_type not in self.OP_WITHOUT_KERNEL_SET
+
     def to_string(self, throw_on_error):
         """
         To debug string.
@@ -742,7 +747,9 @@ class Block(object):
     def var(self, name):
         if not isinstance(name, basestring):
-            raise TypeError()
+            raise TypeError(
+                "var require string as parameter, but get %s instead." %
+                (type(name)))
         v = self.vars.get(name, None)
         if v is None:
             raise ValueError("var %s not in this block" % name)
......
@@ -434,7 +434,7 @@ def open_files(filenames,
                shapes,
                lod_levels,
                dtypes,
-               thread_num,
+               thread_num=1,
                buffer_size=None,
                pass_num=1,
                for_parallel=True):
......
@@ -363,6 +363,40 @@ def zeros(shape, dtype, force_cpu=False):
     return fill_constant(value=0.0, **locals())
 
 
+def reverse(x, axis):
+    """
+    **reverse**
+
+    This function reverses the input 'x' along the given axes.
+
+    Args:
+        x(Variable): the input to be reversed.
+        axis(int|tuple|list): the axis (or axes) along which the order of
+                              elements is reversed. If it is a tuple or a
+                              list, reversing is applied on each axis in the
+                              tuple or list.
+
+    Returns:
+        Variable: The reversed tensor.
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.reverse(x=in, axis=0)
+          # or:
+          out = fluid.layers.reverse(x=in, axis=[0,1])
+    """
+    if isinstance(axis, int):
+        axis = [axis]
+    helper = LayerHelper("reverse", **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='reverse',
+        inputs={'X': x},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
 def save(x, file_path, overwrite=True):
     """
     Saves a variable as a file.
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import unittest
import time
import itertools
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from op_test import OpTest
class BenchmarkSuite(OpTest):
def timeit_function(self, callback, iters, *args, **kwargs):
        assert iters >= 1, "iters should be at least 1"
start = time.time()
for i in range(iters):
callback(*args, **kwargs)
elapse = time.time() - start
return elapse / iters
def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol):
for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs,
fetch_list):
# the cpu version is the baseline; the gpu version is expected to match it.
expect = item_cpu_out
expect_t = np.array(item_cpu_out)
actual = item_gpu_out
actual_t = np.array(item_gpu_out)
var_name = variable if isinstance(variable,
basestring) else variable.name
self.assertTrue(
np.allclose(
actual_t, expect_t, atol=atol),
"Output (" + var_name + ") has diff:\n" + str(actual_t) + "\n" +
str(expect_t))
self.assertListEqual(actual.lod(),
expect.lod(),
"Output (" + var_name + ") has different lod")
def _get_input_names(self):
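# Collect the variable names to differentiate with respect to: the
# sub-names of duplicable inputs plus each input slot name itself.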
inputs = []
for name, value in self.inputs.iteritems():
if isinstance(value, list):
inputs.extend([sub_name for sub_name, _ in value])
inputs.append(name)
return inputs
def _get_output_names(self):
outputs = []
for var_name, var in self.outputs.iteritems():
if isinstance(var, list):
for sub_var_name, sub_var in var:
outputs.append(sub_var_name)
else:
outputs.append(var_name)
if len(outputs) == 0:
for out_name, out_dup in Operator.get_op_outputs(self.op_type):
outputs.append(str(out_name))
return outputs
def check_output_stability(self, atol=1e-8):
places = self._get_places()
if len(places) < 2:
return
cpu_outs, fetch_list = self._calc_output(places[0])
gpu_outs, _ = self._calc_output(places[1])
self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol)
def timeit_output_with_place(self, place, iters):
return self.timeit_function(self.calc_output, iters, place)
def timeit_output(self, iters=100):
places = self._get_places()
elapses = []
for place in places:
elapses.append(self.timeit_output_with_place(place, iters))
for place, elapse in zip(places, elapses):
print("One pass of ({2}_op) at {0} cost {1}".format(
str(place), elapse, self.op_type))
def timeit_grad_with_place(self, place, iters=100):
inputs_to_check = self._get_input_names()
output_names = self._get_output_names()
return self.timeit_function(
self._get_gradient,
iters,
inputs_to_check,
place,
output_names,
no_grad_set=None)
def timeit_grad(self, iters=100):
places = self._get_places()
elapses = []
for place in places:
elapses.append(self.timeit_grad_with_place(place, iters))
for place, elapse in zip(places, elapses):
print("One pass of ({2}_grad_op) at {0} cost {1}".format(
str(place), elapse, self.op_type))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
from benchmark import BenchmarkSuite
from op_test import OpTest
# This is a demo op test case for operator benchmarking and high-resolution numerical stability checking.
class TestSumOp(BenchmarkSuite):
def setUp(self):
self.op_type = "sum"
self.customize_testcase()
self.customize_fetch_list()
def customize_fetch_list(self):
"""
Customize the fetch list to configure which variables are fetched.
>>> self.fetch_list = ["Out"]
"""
self.fetch_list = ["Out"]
def customize_testcase(self):
# a test case
x0 = np.random.random((300, 400)).astype('float32')
x1 = np.random.random((300, 400)).astype('float32')
x2 = np.random.random((300, 400)).astype('float32')
# NOTE: if the output is empty, it will be auto-filled by BenchmarkSuite;
# only the output dtype is used; the shape, lod and data are computed from the input.
self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
self.outputs = {"Out": x0 + x1 + x2}
def test_check_output(self):
"""
Compare the op output with the customized expected output. In this case,
you should set the correct output by hand.
>>> self.outputs = {"Out": x0 + x1 + x2}
"""
self.check_output(atol=1e-8)
def test_output_stability(self):
# compare the cpu gpu output in high resolution.
self.check_output_stability()
def test_timeit_output(self):
"""
Profile the op's forward pass; the time cost is averaged over `iters` runs.
Example output:
>>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
>>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
"""
self.timeit_output(iters=100)
def test_timeit_grad(self):
"""
Profile the op's gradient pass; the time cost is averaged over `iters` runs.
Example output:
>>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
>>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
"""
self.timeit_grad(iters=100)
if __name__ == "__main__":
unittest.main()
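# To benchmark another operator, this file's pattern can presumably be
# reused as-is: subclass BenchmarkSuite, set self.op_type, fill
# self.inputs / self.outputs (and optionally self.fetch_list), then run the
# module directly with python; how it is wired into the regular test runner
# is left to the build configuration.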
...@@ -15,13 +15,17 @@
import unittest
import numpy as np
import random
import time
import itertools
import collections
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.backward import append_backward
from paddle.fluid.op import Operator
from paddle.fluid.executor import Executor
from paddle.fluid.framework import Program, OpProtoHolder, Variable
from testsuite import create_op, set_input, append_input_output, append_loss_ops
def randomize_probability(batch_size, class_num, dtype='float32'):
...@@ -33,73 +37,6 @@ def randomize_probability(batch_size, class_num, dtype='float32'):
return prob
def create_op(scope, op_type, inputs, outputs, attrs):
kwargs = dict()
op_maker = core.op_proto_and_checker_maker
op_role_attr_name = op_maker.kOpRoleAttrName()
if op_role_attr_name not in attrs:
attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)
def __create_var__(name, var_name):
scope.var(var_name).get_tensor()
kwargs[name].append(var_name)
for in_name, in_dup in Operator.get_op_inputs(op_type):
if in_name in inputs:
kwargs[in_name] = []
if in_dup:
sub_in = inputs[in_name]
for item in sub_in:
sub_in_name, _ = item[0], item[1]
__create_var__(in_name, sub_in_name)
else:
__create_var__(in_name, in_name)
for out_name, out_dup in Operator.get_op_outputs(op_type):
if out_name in outputs:
kwargs[out_name] = []
if out_dup:
sub_out = outputs[out_name]
for item in sub_out:
sub_out_name, _ = item[0], item[1]
__create_var__(out_name, sub_out_name)
else:
__create_var__(out_name, out_name)
for attr_name in Operator.get_op_attr_names(op_type):
if attr_name in attrs:
kwargs[attr_name] = attrs[attr_name]
return Operator(op_type, **kwargs)
def set_input(scope, op, inputs, place):
def __set_input__(var_name, var):
if isinstance(var, tuple) or isinstance(var, np.ndarray):
tensor = scope.find_var(var_name).get_tensor()
if isinstance(var, tuple):
tensor.set_lod(var[1])
var = var[0]
tensor.set_dims(var.shape)
tensor.set(var, place)
elif isinstance(var, float):
scope.find_var(var_name).set_float(var)
elif isinstance(var, int):
scope.find_var(var_name).set_int(var)
for in_name, in_dup in Operator.get_op_inputs(op.type()):
if in_name in inputs:
if in_dup:
sub_in = inputs[in_name]
for item in sub_in:
sub_in_name, sub_in_val = item[0], item[1]
__set_input__(sub_in_name, sub_in_val)
else:
__set_input__(in_name, inputs[in_name])
def get_numeric_gradient(place,
scope,
op,
...@@ -173,54 +110,15 @@ def get_numeric_gradient(place,
return gradient_flat.reshape(tensor_to_check.get_dims())
def append_input_output(block, op_proto, np_list, is_input):
'''Insert VarDesc and generate Python variable instance'''
proto_list = op_proto.inputs if is_input else op_proto.outputs
def create_var(block, name, np_list, var_proto):
if name not in np_list:
assert var_proto.intermediate, "{} not found".format(name)
shape = None
lod_level = None
else:
np_value = np_list[name]
if isinstance(np_value, tuple):
shape = list(np_value[0].shape)
lod_level = len(np_value[1])
else:
shape = list(np_value.shape)
lod_level = 0
return block.create_var(
dtype="float32", shape=shape, lod_level=lod_level, name=name)
var_dict = {}
for var_proto in proto_list:
var_name = str(var_proto.name)
if is_input:
if (var_name not in np_list) and var_proto.dispensable:
continue
assert (var_name in np_list) or (var_proto.dispensable), \
"Missing {} as input".format(var_name)
if var_proto.duplicable:
assert isinstance(np_list[var_name], list), \
"Duplicable {} should be set as list".format(var_name)
var_list = []
for (name, np_value) in np_list[var_name]:
var_list.append(
create_var(block, name, {name: np_value}, var_proto))
var_dict[var_name] = var_list
else:
var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
return var_dict
class OpTest(unittest.TestCase):
@classmethod
def setUpClass(cls):
'''Fix random seeds to remove randomness from tests'''
cls._np_rand_state = np.random.get_state()
cls._py_rand_state = random.getstate()
cls.call_once = False
cls.dtype = "float32"
cls.outputs = {}
np.random.seed(123)
random.seed(124)
...@@ -231,6 +129,31 @@ class OpTest(unittest.TestCase):
np.random.set_state(cls._np_rand_state)
random.setstate(cls._py_rand_state)
def try_call_once(self, data_type):
if not self.call_once:
self.call_once = True
self.dtype = data_type
def infer_dtype_from_inputs_outputs(self, inputs, outputs):
def infer_dtype(numpy_dict):
assert isinstance(
numpy_dict,
dict), "self.inputs, self.outputs must be numpy_dict"
for var_name, var_value in numpy_dict.iteritems():
if isinstance(var_value, (np.ndarray, np.generic)):
self.try_call_once(var_value.dtype)
elif isinstance(var_value, (list, tuple)):
# the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
if len(var_value) > 1 and isinstance(var_value[1], (
np.ndarray, np.generic)):
instance = var_value[1]
self.try_call_once(instance[1].dtype)
else:
self.try_call_once("float32")
infer_dtype(inputs)
infer_dtype(outputs)
def feed_var(self, input_vars, place):
feed_map = {}
for var_name in input_vars:
...@@ -254,18 +177,14 @@ class OpTest(unittest.TestCase):
return feed_map
def _append_ops(self, block):
op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
"infer datatype from inputs and outputs for this test case"
self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
inputs = append_input_output(block, op_proto, self.inputs, True,
self.dtype)
outputs = append_input_output(block, op_proto, self.outputs, False,
self.dtype)
op = block.append_op(
type=self.op_type,
inputs=inputs,
...@@ -275,19 +194,65 @@ class OpTest(unittest.TestCase):
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
def _get_io_vars(self, block, numpy_inputs):
inputs = {}
for name, value in numpy_inputs.iteritems():
if isinstance(value, list):
var_list = [
block.var(sub_name) for sub_name, sub_value in value
]
inputs[name] = var_list
else:
inputs[name] = block.var(name)
return inputs
def _get_inputs(self, block):
return self._get_io_vars(block, self.inputs)
def _get_outputs(self, block):
return self._get_io_vars(block, self.outputs)
def calc_output(self, place):
outs, _ = self._calc_output(place)
return outs
def _calc_output(self, place, parallel=False):
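# Build a fresh Program containing only the op under test, feed the numpy
# inputs, and run it with an Executor (or a ParallelExecutor when
# parallel=True), fetching either the user-supplied fetch_list or the
# declared outputs.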
program = Program()
block = program.global_block()
self._append_ops(block)
inputs = self._get_inputs(block)
outputs = self._get_outputs(block)
feed_map = self.feed_var(inputs, place)
if parallel:
use_cuda = False
if isinstance(place, fluid.CUDAPlace(0)):
use_cuda = True
executor = fluid.ParallelExecutor(
use_cuda=use_cuda, loss_name=loss.name, main_program=program)
else:
executor = Executor(place)
fetch_list = getattr(self, "fetch_list", [])
# if the fetch_list is customized by user, we use it directly.
# if not, fill the fetch_list by the user configured outputs in test.
if len(fetch_list) == 0:
for var_name, var in outputs.iteritems():
if isinstance(var, list):
for v in var:
fetch_list.append(v)
else:
fetch_list.append(var)
# if the fetch_list still empty, fill the fetch_list by the operator output.
if len(fetch_list) == 0:
for out_name, out_dup in Operator.get_op_outputs(self.op_type):
fetch_list.append(str(out_name))
# fetch_list = map(block.var, fetch_list)
if not isinstance(fetch_list[0], Variable):
fetch_list = map(block.var, fetch_list)
outs = executor.run(program,
feed=feed_map,
fetch_list=fetch_list,
return_numpy=False)
...@@ -346,17 +311,19 @@ class OpTest(unittest.TestCase):
"Output (" + out_name +
") has different lod at " + str(place))
def _get_places(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
places.append(core.CUDAPlace(0))
return places
def check_output(self, atol=1e-5):
places = self._get_places()
for place in places:
self.check_output_with_place(place, atol)
def check_output_customized(self, checker):
places = self._get_places()
for place in places:
outs = self.calc_output(place)
outs = [np.array(out) for out in outs]
...@@ -389,9 +356,7 @@ class OpTest(unittest.TestCase):
in_place=False,
max_relative_error=0.005,
user_defined_grads=None):
places = self._get_places()
for place in places:
self.check_grad_with_place(place, inputs_to_check, output_names,
no_grad_set, numeric_grad_delta,
...@@ -438,35 +403,6 @@ class OpTest(unittest.TestCase):
max_relative_error,
"Gradient Check On %s" % str(place))
@staticmethod
def _create_var_descs_(block, var_dict):
# FIXME: Try unify with `append_input_output`
for param_name in var_dict:
var = var_dict[param_name]
if not isinstance(var, list) and not isinstance(var, tuple):
var = [(param_name, var, None)]
if not isinstance(var[0], list) and not isinstance(var[0], tuple):
var = [(param_name, var[0], var[1])]
for i, item in enumerate(var):
if not isinstance(item[0], basestring):
item = [[param_name] + list(item)]
if len(item) == 2:
if isinstance(item[1], tuple):
var[i] = [item[0], item[1][0], item[1][1]]
else:
# only set var name and value, set lod to None
var[i] = list(item) + [None]
var_descs = [(block.create_var(
name=name, shape=each.shape, dtype=each.dtype), each, lod)
for name, each, lod in var]
yield param_name, var_descs
@staticmethod
def _merge_list(iterable):
return reduce(lambda a, b: list(a) + list(b), iterable, [])
@staticmethod
def _numpy_to_lod_tensor(np_value, lod, place):
tensor = core.LoDTensor()
...@@ -497,82 +433,30 @@ class OpTest(unittest.TestCase):
input.dtype = np.uint16
return input
def _get_gradient(self,
input_to_check,
place,
output_names,
no_grad_set,
parallel=False):
prog = Program()
block = prog.global_block()
self._append_ops(block)
loss = append_loss_ops(block, output_names)
param_grad_list = append_backward(
loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
inputs = self._get_inputs(block)
feed_dict = self.feed_var(inputs, place)
fetch_list = [g for p, g in param_grad_list]
if parallel:
use_cuda = False
if isinstance(place, fluid.CUDAPlace(0)):
use_cuda = True
executor = fluid.ParallelExecutor(
use_cuda=use_cuda, loss_name=loss.name, main_program=program)
else:
executor = Executor(place)
return map(np.array,
executor.run(prog, feed_dict, fetch_list,
...
...@@ -252,5 +252,25 @@ class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
self.axis = 1
class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(3, 20, 20).astype(self.dtype)
self.y = np.random.rand(3, 1, 1).astype(self.dtype)
self.out = self.x + self.y
def init_axis(self):
self.axis = -1
class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(3, 10, 20).astype(self.dtype)
self.y = np.random.rand(3, 1, 1).astype(self.dtype)
self.out = self.x + self.y
def init_axis(self):
self.axis = -1
if __name__ == '__main__':
unittest.main()
...@@ -194,107 +194,104 @@ class TestLstmOp(OpTest):
['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)
# class TestLstmOpHasInitial(TestLstmOp):
# def set_argument(self):
# self.lod = [[0, 2, 5, 7]]
# self.D = 16
# self.act_gate = 'sigmoid'
# self.act_cell = 'tanh'
# self.act_cand = 'tanh'
# self.has_initial_state = True
# self.is_reverse = True
# self.use_peepholes = True
# def test_check_grad(self):
# # TODO(qingqing) remove folowing lines after the check_grad is refined.
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
# max_relative_error=5e-4)
# def test_check_grad_ingore_bias(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Weight'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('Bias'))
# def test_check_grad_ingore_weight(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Bias'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('Weight'))
# def test_check_grad_ingore_input(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Weight', 'Bias'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('Input'))
# def test_check_grad_ingore_h0(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('H0'))
# def test_check_grad_ingore_c0(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('C0'))
# class TestLstmOpRerverse(TestLstmOp):
# def set_argument(self):
# self.lod = [[0, 2, 5, 7]]
# self.D = 16
# self.act_gate = 'sigmoid'
# self.act_cell = 'tanh'
# self.act_cand = 'tanh'
# self.has_initial_state = False
# self.is_reverse = True
# self.use_peepholes = True
# class TestLstmOpNotUsePeepholes(TestLstmOp):
# def set_argument(self):
# self.lod = [[0, 2, 5, 7]]
# self.D = 16
# self.act_gate = 'sigmoid'
# self.act_cell = 'tanh'
# self.act_cand = 'tanh'
# self.has_initial_state = False
# self.is_reverse = True
# self.use_peepholes = False
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
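# TestReverseOp builds the expected result with np.flip applied along each
# axis in self.axis, then checks the reverse op's forward output and its
# gradient; the subclasses below vary the axis list and the input rank.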
class TestReverseOp(OpTest):
def initTestCase(self):
self.x = np.random.random((3, 4)).astype('float32')
self.axis = [0]
def setUp(self):
self.initTestCase()
self.op_type = "reverse"
self.inputs = {"X": self.x}
self.attrs = {'axis': self.axis}
out = self.x
for a in self.axis:
out = np.flip(out, axis=a)
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
class TestCase0(TestReverseOp):
def initTestCase(self):
self.x = np.random.random((3, 4)).astype('float32')
self.axis = [1]
class TestCase1(TestReverseOp):
def initTestCase(self):
self.x = np.random.random((3, 4)).astype('float32')
self.axis = [0, 1]
class TestCase2(TestReverseOp):
def initTestCase(self):
self.x = np.random.random((3, 4, 5)).astype('float32')
self.axis = [0, 2]
class TestCase3(TestReverseOp):
def initTestCase(self):
self.x = np.random.random((3, 4, 5)).astype('float32')
self.axis = [1, 2]
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
def as_lodtensor(np_array, lod, place):
tensor = core.LoDTensor()
tensor.set(np_array, place)
if lod is not None:
tensor.set_lod(lod)
return tensor
def create_op(scope, op_type, inputs, outputs, attrs):
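# Build an Operator instance for op_type: create a scope variable for every
# declared input/output that the test supplies (one per sub-name for
# duplicable slots) and pass through any attributes the op recognizes.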
kwargs = dict()
op_maker = core.op_proto_and_checker_maker
op_role_attr_name = op_maker.kOpRoleAttrName()
if op_role_attr_name not in attrs:
attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)
def __create_var__(name, var_name):
scope.var(var_name).get_tensor()
kwargs[name].append(var_name)
for in_name, in_dup in Operator.get_op_inputs(op_type):
if in_name in inputs:
kwargs[in_name] = []
if in_dup:
sub_in = inputs[in_name]
for item in sub_in:
sub_in_name, _ = item[0], item[1]
__create_var__(in_name, sub_in_name)
else:
__create_var__(in_name, in_name)
for out_name, out_dup in Operator.get_op_outputs(op_type):
if out_name in outputs:
kwargs[out_name] = []
if out_dup:
sub_out = outputs[out_name]
for item in sub_out:
sub_out_name, _ = item[0], item[1]
__create_var__(out_name, sub_out_name)
else:
__create_var__(out_name, out_name)
for attr_name in Operator.get_op_attr_names(op_type):
if attr_name in attrs:
kwargs[attr_name] = attrs[attr_name]
return Operator(op_type, **kwargs)
def set_input(scope, op, inputs, place):
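# Copy the numpy values prepared by a test into the scope's tensors; an
# (ndarray, lod) tuple also sets the LoD, while bare floats and ints are
# stored directly on the variable.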
def __set_input__(var_name, var):
if isinstance(var, tuple) or isinstance(var, np.ndarray):
tensor = scope.find_var(var_name).get_tensor()
if isinstance(var, tuple):
tensor.set_lod(var[1])
var = var[0]
tensor.set_dims(var.shape)
tensor.set(var, place)
elif isinstance(var, float):
scope.find_var(var_name).set_float(var)
elif isinstance(var, int):
scope.find_var(var_name).set_int(var)
for in_name, in_dup in Operator.get_op_inputs(op.type()):
if in_name in inputs:
if in_dup:
sub_in = inputs[in_name]
for item in sub_in:
sub_in_name, sub_in_val = item[0], item[1]
__set_input__(sub_in_name, sub_in_val)
else:
__set_input__(in_name, inputs[in_name])
def append_input_output(block, op_proto, np_list, is_input, dtype):
'''Insert VarDesc and generate Python variable instance'''
proto_list = op_proto.inputs if is_input else op_proto.outputs
def create_var(block, name, np_list, var_proto):
dtype = None
shape = None
lod_level = None
if name not in np_list:
assert var_proto.intermediate, "{} not found".format(name)
else:
np_value = np_list[name]
if isinstance(np_value, tuple):
dtype = np_value[0].dtype
# output shape and lod should be inferred from the input.
if is_input:
shape = list(np_value[0].shape)
lod_level = len(np_value[1])
else:
dtype = np_value.dtype
if is_input:
shape = list(np_value.shape)
lod_level = 0
return block.create_var(
dtype=dtype, shape=shape, lod_level=lod_level, name=name)
var_dict = {}
for var_proto in proto_list:
var_name = str(var_proto.name)
if is_input:
if (var_name not in np_list) and var_proto.dispensable:
continue
assert (var_name in np_list) or (var_proto.dispensable), \
"Missing {} as input".format(var_name)
if var_proto.duplicable:
assert isinstance(np_list[var_name], list), \
"Duplicable {} should be set as list".format(var_name)
var_list = []
for (name, np_value) in np_list[var_name]:
var_list.append(
create_var(block, name, {name: np_value}, var_proto))
var_dict[var_name] = var_list
else:
var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
return var_dict
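# append_loss_ops reduces the fetched outputs to a single scalar loss so
# that append_backward can be applied: a single output is averaged with a
# mean op; multiple outputs are each averaged, summed, and rescaled by
# 1.0 / len(outputs).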
def append_loss_ops(block, output_names):
mean_inputs = map(block.var, output_names)
# for item in mean_inputs:
# print(item)
# print("Item", item.dtype)
if len(mean_inputs) == 1:
loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
op = block.append_op(
inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
else:
avg_sum = []
for cur_loss in mean_inputs:
cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
op = block.append_op(
inputs={"X": [cur_loss]},
outputs={"Out": [cur_avg_loss]},
type="mean")
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
avg_sum.append(cur_avg_loss)
loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
op_sum = block.append_op(
inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
op_sum.desc.infer_var_type(block.desc)
op_sum.desc.infer_shape(block.desc)
loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
op_loss = block.append_op(
inputs={"X": loss_sum},
outputs={"Out": loss},
type='scale',
attrs={'scale': 1.0 / float(len(avg_sum))})
op_loss.desc.infer_var_type(block.desc)
op_loss.desc.infer_shape(block.desc)
return loss