Commit bf485999 authored by Liu Yiqun

Merge branch 'develop' into core_inference_prepare

@@ -48,6 +48,13 @@ parser.add_argument(
     type=int,
     default=16,
     help="The sequence number of a mini-batch data. (default: %(default)d)")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The first num of minibatch num to skip, for better performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
 parser.add_argument(
     "--dict_size",
     type=int,
@@ -72,16 +79,21 @@ parser.add_argument(
     default=3,
     help="The width for beam searching. (default: %(default)d)")
 parser.add_argument(
-    "--use_gpu",
-    type=distutils.util.strtobool,
-    default=True,
-    help="Whether to use gpu. (default: %(default)d)")
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
 parser.add_argument(
     "--max_length",
     type=int,
     default=250,
     help="The maximum length of sequence when doing generation. "
     "(default: %(default)d)")
+parser.add_argument(
+    '--with_test',
+    action='store_true',
+    help='If set, test the testset during training.')


 def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
@@ -281,7 +293,7 @@ def train():
             paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
         batch_size=args.batch_size)

-    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
     exe = Executor(place)
     exe.run(framework.default_startup_program())
@@ -307,14 +319,20 @@ def train():
         return total_loss / count

+    iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in xrange(args.pass_num):
-        pass_start_time = time.time()
-        words_seen = 0
+        train_accs = []
+        train_losses = []
         for batch_id, data in enumerate(train_batch_generator()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
             src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
-            words_seen += word_num
+            num_samples += word_num
             trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
-            words_seen += word_num
+            num_samples += word_num
             lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)

             fetch_outs = exe.run(framework.default_main_program(),
@@ -325,24 +343,36 @@ def train():
                                  },
                                  fetch_list=[avg_cost])

-            avg_cost_val = np.array(fetch_outs[0])
-            print('pass_id=%d, batch_id=%d, train_loss: %f' %
-                  (pass_id, batch_id, avg_cost_val))
+            iters += 1
+            loss = np.array(fetch_outs[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
+            )  # The accuracy is the accumulation of batches, but not the current batch.

-        pass_end_time = time.time()
-        test_loss = do_validation()
-        time_consumed = pass_end_time - pass_start_time
-        words_per_sec = words_seen / time_consumed
-        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
-              (pass_id, test_loss, words_per_sec, time_consumed))
+    train_elapsed = time.time() - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+    # evaluation
+    if args.with_test:
+        test_loss = do_validation()
+    exit(0)


 def infer():
     pass


+def print_arguments(args):
+    print('----------- seq2seq Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':
     args = parser.parse_args()
+    print_arguments(args)
     if args.infer_only:
         infer()
     else:
...
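The flags added above (--skip_batch_num, --iterations) and the iters/num_samples/start_time bookkeeping implement a warm-up-aware throughput measurement: the clock and the sample counter are reset once the first skip_batch_num mini-batches have run, and measurement stops after `iterations` mini-batches. A minimal standalone sketch of that pattern, where `reader` and `run_batch` are hypothetical stand-ins for the framework-specific data source and the exe.run call:

import time

def benchmark(reader, run_batch, skip_batch_num=5, iterations=80):
    # reader() yields mini-batches; run_batch(data) is assumed to execute one
    # mini-batch and return the number of samples it processed.
    iters, num_samples, start_time = 0, 0, time.time()
    for data in reader():
        if iters == skip_batch_num:
            # restart timing after warm-up so one-off startup costs are excluded
            start_time, num_samples = time.time(), 0
        if iters == iterations:
            break
        num_samples += run_batch(data)
        iters += 1
    elapsed = time.time() - start_time
    return num_samples, elapsed, num_samples / elapsed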
@@ -35,6 +35,12 @@ def parse_args():
     parser = argparse.ArgumentParser("mnist model benchmark.")
     parser.add_argument(
         '--batch_size', type=int, default=128, help='The minibatch size.')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
     parser.add_argument(
         '--iterations', type=int, default=35, help='The number of minibatches.')
     parser.add_argument(
@@ -53,19 +59,14 @@ def parse_args():
         '--use_nvprof',
         action='store_true',
         help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
     args = parser.parse_args()
     return args


-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
 def cnn_model(data):
     conv_pool_1 = fluid.nets.simple_img_conv_pool(
         input=data,
@@ -138,9 +139,6 @@ def run_benchmark(model, args):
     # inference program
     inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])

     # Optimization
     opt = fluid.optimizer.AdamOptimizer(
@@ -160,39 +158,60 @@ def run_benchmark(model, args):
     train_reader = paddle.batch(
         paddle.dataset.mnist.train(), batch_size=args.batch_size)

-    accuracy = fluid.average.WeightedAverage()
+    accuracy = fluid.metrics.Accuracy()
+    iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in range(args.pass_num):
         accuracy.reset()
-        pass_start = time.time()
+        train_accs = []
+        train_losses = []
         for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
             img_data = np.array(
                 map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([len(y_data), 1])

-            start = time.time()
             outs = exe.run(
                 fluid.default_main_program(),
                 feed={"pixel": img_data,
                       "label": y_data},
                 fetch_list=[avg_cost, batch_acc, batch_size_tensor]
             )  # The accuracy is the accumulation of batches, but not the current batch.
-            accuracy.add(value=outs[1], weight=outs[2])
-            end = time.time()
+            accuracy.update(value=outs[1], weight=outs[2])
+            iters += 1
+            num_samples += len(y_data)
             loss = np.array(outs[0])
             acc = np.array(outs[1])
-            print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
-                  (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
-
-        pass_end = time.time()
-
-        train_avg_acc = accuracy.eval()
-        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
-                                 inference_program)
-
-        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
-              (pass_id, train_avg_acc, test_avg_acc,
-               (pass_end - pass_start) / 1000))
+            train_losses.append(loss)
+            train_accs.append(acc)
+            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
+                  (pass_id, iters, loss, acc))
+
+        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+    train_elapsed = time.time() - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+    # evaluation
+    if args.with_test:
+        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+                                 inference_program)
+    exit(0)
+
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- mnist Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':
...
@@ -87,15 +87,6 @@ def parse_args():
     return args


-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
 def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
     conv1 = fluid.layers.conv2d(
         input=input,
@@ -279,32 +270,31 @@ def run_benchmark(model, args):
                           'label': label},
                 fetch_list=[avg_cost, batch_acc, batch_size_tensor])
             iters += 1
-            num_samples += label[0]
+            num_samples += len(label)
             accuracy.add(value=acc, weight=weight)
             train_losses.append(loss)
             train_accs.append(acc)
             print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
                   (pass_id, iters, loss, acc))
-        pass_train_acc = accuracy.eval()
-        # evaluation
-        if args.with_test:
-            pass_test_acc = test(exe)
-        train_elapsed = time.time() - start_time
         print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
               (pass_id, np.mean(train_losses), np.mean(train_accs)))
+    train_elapsed = time.time() - start_time
     examples_per_sec = num_samples / train_elapsed
     print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
           (num_samples, train_elapsed, examples_per_sec))
+    # evaluation
+    if args.with_test:
+        pass_test_acc = test(exe)
+    exit(0)

-    if args.use_cprof:
-        pr.disable()
-        s = StringIO.StringIO()
-        sortby = 'cumulative'
-        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
-        ps.print_stats()
-        print(s.getvalue())

+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- resnet Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':
...
 #!/bin/bash
 # This script benchmarking the PaddlePaddle Fluid on
 # single thread single GPU.
-export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
+#export FLAGS_fraction_of_gpu_memory_to_use=0.0
+export CUDNN_PATH=/paddle/cudnn_v5

 # disable openmp and mkl parallel
 #https://github.com/PaddlePaddle/Paddle/issues/7199
@@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0
 export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH

+# only query the gpu used
+nohup stdbuf -oL nvidia-smi \
+      --id=${CUDA_VISIBLE_DEVICES} \
+      --query-gpu=timestamp \
+      --query-compute-apps=pid,process_name,used_memory \
+      --format=csv \
+      --filename=mem.log \
+      -l 1 &
+# mnist
+# mnist gpu mnist 128
+FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+  --device=GPU \
+  --batch_size=128 \
+  --skip_batch_num=5 \
+  --iterations=500 \
+  2>&1 | tee -a mnist_gpu_128.log
+
 # vgg16
-# cifar10 gpu cifar10 128
-FLAGS_benchmark=true python fluid/vgg.py \
+# gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
   --device=GPU \
   --batch_size=128 \
   --skip_batch_num=5 \
   --iterations=30 \
-  2>&1 > vgg16_gpu_128.log
+  2>&1 | tee -a vgg16_gpu_128.log
+
+# flowers gpu 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+  --device=GPU \
+  --batch_size=32 \
+  --data_set=flowers \
+  --skip_batch_num=5 \
+  --iterations=30 \
+  2>&1 | tee -a vgg16_gpu_flowers_32.log

 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true python fluid/resnet.py \
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
   --device=GPU \
   --batch_size=128 \
   --data_set=cifar10 \
   --model=resnet_cifar10 \
   --skip_batch_num=5 \
   --iterations=30 \
-  2>&1 > resnet50_gpu_128.log
+  2>&1 | tee -a resnet50_gpu_128.log
+
+# resnet50 gpu flowers 64
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+  --device=GPU \
+  --batch_size=64 \
+  --data_set=flowers \
+  --model=resnet_imagenet \
+  --skip_batch_num=5 \
+  --iterations=30 \
+  2>&1 | tee -a resnet50_gpu_flowers_64.log

 # lstm
+# lstm gpu imdb 32 # tensorflow only support batch=32
+FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+  --device=GPU \
+  --batch_size=32 \
+  --skip_batch_num=5 \
+  --iterations=30 \
+  --hidden_dim=512 \
+  --emb_dim=512 \
+  --crop_size=1500 \
+  2>&1 | tee -a lstm_gpu_32.log
+
+# seq2seq
+# seq2seq gpu wmb 128
+FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+  --device=GPU \
+  --batch_size=128 \
+  --skip_batch_num=5 \
+  --iterations=30 \
+  2>&1 | tee -a lstm_gpu_128.log
@@ -37,6 +37,14 @@ def parse_args():
         type=int,
         default=32,
         help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
     parser.add_argument(
         '--emb_dim',
         type=int,
@@ -64,6 +72,10 @@ def parse_args():
         default=int(os.environ.get('CROP_SIZE', '1500')),
         help='The max sentence length of input. Since this model use plain RNN,'
         ' Gradient could be explored if sentence is too long')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
     args = parser.parse_args()
     return args
@@ -157,37 +169,43 @@ def main():
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())

-    def train_loop(pass_num, crop_size):
-        with profiler.profiler(args.device, 'total') as prof:
-            for pass_id in range(pass_num):
-                train_reader = batch(
-                    paddle.reader.shuffle(
-                        crop_sentence(imdb.train(word_dict), crop_size),
-                        buf_size=25000),
-                    batch_size=args.batch_size)
-                word_nums = 0
-                pass_start_time = time.time()
-                for batch_id, data in enumerate(train_reader()):
-                    tensor_words = to_lodtensor([x[0] for x in data], place)
-                    for x in data:
-                        word_nums += len(x[0])
-                    label = numpy.array([x[1] for x in data]).astype("int64")
-                    label = label.reshape((-1, 1))
-                    loss_np, acc, weight = exe.run(
-                        fluid.default_main_program(),
-                        feed={"words": tensor_words,
-                              "label": label},
-                        fetch_list=[loss, batch_acc, batch_size_tensor])
-                    print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
-                          (pass_id, batch_id, loss_np, acc))
-
-                pass_end_time = time.time()
-                time_consumed = pass_end_time - pass_start_time
-                words_per_sec = word_nums / time_consumed
-                print("pass_id=%d, sec/pass: %f, words/s: %f" %
-                      (pass_id, time_consumed, words_per_sec))
-
-    train_loop(args.pass_num, args.crop_size)
+    train_reader = batch(
+        paddle.reader.shuffle(
+            crop_sentence(imdb.train(word_dict), args.crop_size),
+            buf_size=25000),
+        batch_size=args.batch_size)
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            tensor_words = to_lodtensor([x[0] for x in data], place)
+            label = numpy.array([x[1] for x in data]).astype("int64")
+            label = label.reshape((-1, 1))
+            loss_np, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"words": tensor_words,
+                      "label": label},
+                fetch_list=[loss, batch_acc, batch_size_tensor])
+            iters += 1
+            for x in data:
+                num_samples += len(x[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iters, loss_np, acc)
+            )  # The accuracy is the accumulation of batches, but not the current batch.
+
+    train_elapsed = time.time() - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+    exit(0)


 def to_lodtensor(data, place):
@@ -205,5 +223,14 @@ def to_lodtensor(data, place):
     return res


+def print_arguments(args):
+    print('----------- lstm Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
     main()
@@ -191,25 +191,29 @@ def main():
                 fetch_list=[avg_cost, batch_acc, batch_size_tensor])
             accuracy.add(value=acc, weight=weight)
             iters += 1
-            num_samples += len(data)
+            num_samples += len(y_data)
             print(
                 "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                 (pass_id, iters, loss, acc)
             )  # The accuracy is the accumulation of batches, but not the current batch.
-            pass_train_acc = accuracy.eval()
+            # pass_train_acc = accuracy.eval()
             train_losses.append(loss)
             train_accs.append(acc)
+        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+    train_elapsed = time.time() - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+          (num_samples, train_elapsed, examples_per_sec))
     # evaluation
     if args.with_test:
         pass_test_acc = test(exe)
-        train_elapsed = time.time() - start_time
-        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+    exit(0)


 def print_arguments():
-    print('----------- Configuration Arguments -----------')
+    print('----------- vgg Configuration Arguments -----------')
     for arg, value in sorted(vars(args).iteritems()):
         print('%s: %s' % (arg, value))
     print('------------------------------------------------')
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.framework import dtypes
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.framework import ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops.rnn_cell_impl import RNNCell, BasicLSTMCell
from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple
from tensorflow.contrib.rnn.python.ops import core_rnn_cell
from tensorflow.python.ops import array_ops
from tensorflow.python.util import nest
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
import numpy as np
import os
import argparse
import time
import logging  # used by the state_is_tuple warning in LSTMCellWithSimpleAttention
import paddle.v2 as paddle
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--embedding_dim",
type=int,
default=512,
help="The dimension of embedding table. (default: %(default)d)")
parser.add_argument(
"--encoder_size",
type=int,
default=512,
help="The size of encoder bi-rnn unit. (default: %(default)d)")
parser.add_argument(
"--decoder_size",
type=int,
default=512,
help="The size of decoder rnn unit. (default: %(default)d)")
parser.add_argument(
"--batch_size",
type=int,
default=128,
help="The sequence number of a mini-batch data. (default: %(default)d)")
parser.add_argument(
"--dict_size",
type=int,
default=30000,
help="The dictionary capacity. Dictionaries of source sequence and "
"target dictionary have same capacity. (default: %(default)d)")
parser.add_argument(
"--max_time_steps",
type=int,
default=81,
help="Max number of time steps for sequence. (default: %(default)d)")
parser.add_argument(
"--pass_num",
type=int,
default=10,
help="The pass number to train. (default: %(default)d)")
parser.add_argument(
"--learning_rate",
type=float,
default=0.0002,
help="Learning rate used to train the model. (default: %(default)f)")
parser.add_argument(
"--infer_only", action='store_true', help="If set, run forward only.")
parser.add_argument(
"--beam_size",
type=int,
default=3,
help="The width for beam searching. (default: %(default)d)")
parser.add_argument(
"--max_generation_length",
type=int,
default=250,
help="The maximum length of sequence when doing generation. "
"(default: %(default)d)")
parser.add_argument(
"--save_freq",
type=int,
default=500,
help="Save model checkpoint every this interation. (default: %(default)d)")
parser.add_argument(
"--model_dir",
type=str,
default='./checkpoint',
help="Path to save model checkpoints. (default: %(default)d)")
_Linear = core_rnn_cell._Linear # pylint: disable=invalid-name
START_TOKEN_IDX = 0
END_TOKEN_IDX = 1
class LSTMCellWithSimpleAttention(RNNCell):
"""Add attention mechanism to BasicLSTMCell.
This class is a wrapper based on tensorflow's `BasicLSTMCell`.
"""
def __init__(self,
num_units,
encoder_vector,
encoder_proj,
source_sequence_length,
forget_bias=1.0,
state_is_tuple=True,
activation=None,
reuse=None):
super(LSTMCellWithSimpleAttention, self).__init__(_reuse=reuse)
if not state_is_tuple:
logging.warn("%s: Using a concatenated state is slower and will "
"soon be deprecated. Use state_is_tuple=True.", self)
self._num_units = num_units
# set padding part to 0
self._encoder_vector = self._reset_padding(encoder_vector,
source_sequence_length)
self._encoder_proj = self._reset_padding(encoder_proj,
source_sequence_length)
self._forget_bias = forget_bias
self._state_is_tuple = state_is_tuple
self._activation = activation or math_ops.tanh
self._linear = None
@property
def state_size(self):
return (LSTMStateTuple(self._num_units, self._num_units) \
if self._state_is_tuple else 2 * self._num_units)
@property
def output_size(self):
return self._num_units
def zero_state(self, batch_size, dtype):
state_size = self.state_size
if hasattr(self, "_last_zero_state"):
(last_state_size, last_batch_size, last_dtype,
last_output) = getattr(self, "_last_zero_state")
if (last_batch_size == batch_size and last_dtype == dtype and
last_state_size == state_size):
return last_output
with ops.name_scope(
type(self).__name__ + "ZeroState", values=[batch_size]):
output = rnn_cell_impl._zero_state_tensors(state_size, batch_size, dtype)
self._last_zero_state = (state_size, batch_size, dtype, output)
return output
def call(self, inputs, state):
sigmoid = math_ops.sigmoid
# Parameters of gates are concatenated into one multiply for efficiency.
if self._state_is_tuple:
c, h = state
else:
c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
# get context from encoder outputs
context = self._simple_attention(self._encoder_vector,
self._encoder_proj, h)
if self._linear is None:
self._linear = _Linear([inputs, context, h], 4 * self._num_units,
True)
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
i, j, f, o = array_ops.split(
value=self._linear([inputs, context, h]),
num_or_size_splits=4,
axis=1)
new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
self._activation(j))
new_h = self._activation(new_c) * sigmoid(o)
if self._state_is_tuple:
new_state = LSTMStateTuple(new_c, new_h)
else:
new_state = array_ops.concat([new_c, new_h], 1)
return new_h, new_state
def _simple_attention(self, encoder_vec, encoder_proj, decoder_state):
"""Implement the attention function.
The implementation has the same logic as the fluid decoder.
"""
decoder_state_proj = tf.contrib.layers.fully_connected(
inputs=decoder_state,
num_outputs=self._num_units,
activation_fn=None,
biases_initializer=None)
decoder_state_expand = tf.tile(
tf.expand_dims(
input=decoder_state_proj, axis=1),
[1, tf.shape(encoder_proj)[1], 1])
concated = tf.concat([decoder_state_expand, encoder_proj], axis=2)
# need to reduce the first dimension
attention_weights = tf.contrib.layers.fully_connected(
inputs=tf.reshape(
concated, shape=[-1, self._num_units * 2]),
num_outputs=1,
activation_fn=tf.nn.tanh,
biases_initializer=None)
attention_weights_reshaped = tf.reshape(
attention_weights, shape=[tf.shape(encoder_vec)[0], -1, 1])
# normalize the attention weights using softmax
attention_weights_normed = tf.nn.softmax(
attention_weights_reshaped, dim=1)
scaled = tf.multiply(attention_weights_normed, encoder_vec)
context = tf.reduce_sum(scaled, axis=1)
return context
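# A minimal NumPy sketch (illustration only, not part of this commit) of the attention
# computed above: project the decoder state, tile it across the encoder time steps,
# score every step through a tanh projection, softmax over time, and return the
# weighted sum of the raw encoder outputs as the context vector.
import numpy as np

def simple_attention_np(encoder_vec, encoder_proj, decoder_state, w_proj, w_score):
    # encoder_vec:   [batch, time, hidden]   raw bi-RNN outputs
    # encoder_proj:  [batch, time, units]    projected encoder outputs
    # decoder_state: [batch, units];  w_proj: [units, units];  w_score: [2 * units, 1]
    state_proj = decoder_state.dot(w_proj)
    expanded = np.repeat(state_proj[:, None, :], encoder_proj.shape[1], axis=1)
    scores = np.tanh(np.concatenate([expanded, encoder_proj], axis=2).dot(w_score))
    weights = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)  # softmax over time
    return (weights * encoder_vec).sum(axis=1)  # [batch, hidden] context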
def _reset_padding(self,
memory,
memory_sequence_length,
check_inner_dims_defined=True):
"""Reset the padding part for encoder inputs.
This function comes from tensorflow's `_prepare_memory` function.
"""
memory = nest.map_structure(
lambda m: ops.convert_to_tensor(m, name="memory"), memory)
if memory_sequence_length is not None:
memory_sequence_length = ops.convert_to_tensor(
memory_sequence_length, name="memory_sequence_length")
if check_inner_dims_defined:
def _check_dims(m):
if not m.get_shape()[2:].is_fully_defined():
raise ValueError(
"Expected memory %s to have fully defined inner dims, "
"but saw shape: %s" % (m.name, m.get_shape()))
nest.map_structure(_check_dims, memory)
if memory_sequence_length is None:
seq_len_mask = None
else:
seq_len_mask = array_ops.sequence_mask(
memory_sequence_length,
maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
dtype=nest.flatten(memory)[0].dtype)
seq_len_batch_size = (memory_sequence_length.shape[0].value or
array_ops.shape(memory_sequence_length)[0])
def _maybe_mask(m, seq_len_mask):
rank = m.get_shape().ndims
rank = rank if rank is not None else array_ops.rank(m)
extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
m_batch_size = m.shape[0].value or array_ops.shape(m)[0]
if memory_sequence_length is not None:
message = ("memory_sequence_length and memory tensor "
"batch sizes do not match.")
with ops.control_dependencies([
check_ops.assert_equal(
seq_len_batch_size, m_batch_size, message=message)
]):
seq_len_mask = array_ops.reshape(
seq_len_mask,
array_ops.concat(
(array_ops.shape(seq_len_mask), extra_ones), 0))
return m * seq_len_mask
else:
return m
return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask),
memory)
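# Tiny NumPy illustration (not part of this commit) of the sequence mask built above:
# positions beyond each sequence's true length are zeroed so padded encoder steps
# contribute nothing to the attention context.
import numpy as np

demo_lengths = np.array([2, 4])  # true lengths of two sequences, maxlen = 4
demo_mask = (np.arange(4)[None, :] < demo_lengths[:, None]).astype(np.float32)
# demo_mask -> [[1., 1., 0., 0.],
#               [1., 1., 1., 1.]]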
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size,
max_generation_length):
src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
src_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
src_embedding_weights = tf.get_variable("source_word_embeddings",
[source_dict_dim, embedding_dim])
src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)
src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
# no peephole
encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
cell_fw=src_forward_cell,
cell_bw=src_reversed_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
dtype=tf.float32)
# concat the forward outputs and backward outputs
encoded_vec = tf.concat(encoder_outputs, axis=2)
# project the encoder outputs to size of decoder lstm
encoded_proj = tf.contrib.layers.fully_connected(
inputs=tf.reshape(
encoded_vec, shape=[-1, embedding_dim * 2]),
num_outputs=decoder_size,
activation_fn=None,
biases_initializer=None)
encoded_proj_reshape = tf.reshape(
encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])
# get init state for decoder lstm's H
backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
decoder_boot = tf.contrib.layers.fully_connected(
inputs=tf.reshape(
backword_first, shape=[-1, embedding_dim]),
num_outputs=decoder_size,
activation_fn=tf.nn.tanh,
biases_initializer=None)
# prepare the initial state for decoder lstm
cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
initial_state = LSTMStateTuple(cell_init, decoder_boot)
# create decoder lstm cell
decoder_cell = LSTMCellWithSimpleAttention(
decoder_size,
encoded_vec
if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
encoded_proj_reshape if not is_generating else
seq2seq.tile_batch(encoded_proj_reshape, beam_size),
src_sequence_length if not is_generating else
seq2seq.tile_batch(src_sequence_length, beam_size),
forget_bias=0.0)
output_layer = Dense(target_dict_dim, name='output_projection')
if not is_generating:
trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
trg_embedding_weights = tf.get_variable(
"target_word_embeddings", [target_dict_dim, embedding_dim])
trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
trg_word_idx)
training_helper = seq2seq.TrainingHelper(
inputs=trg_embedding,
sequence_length=trg_sequence_length,
time_major=False,
name='training_helper')
training_decoder = seq2seq.BasicDecoder(
cell=decoder_cell,
helper=training_helper,
initial_state=initial_state,
output_layer=output_layer)
# get the max length of target sequence
max_decoder_length = tf.reduce_max(trg_sequence_length)
decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
decoder=training_decoder,
output_time_major=False,
impute_finished=True,
maximum_iterations=max_decoder_length)
decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
decoder_pred_train = tf.argmax(
decoder_logits_train, axis=-1, name='decoder_pred_train')
masks = tf.sequence_mask(
lengths=trg_sequence_length,
maxlen=max_decoder_length,
dtype=tf.float32,
name='masks')
# place holder of label sequence
lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])
# compute the loss
loss = seq2seq.sequence_loss(
logits=decoder_logits_train,
targets=lbl_word_idx,
weights=masks,
average_across_timesteps=True,
average_across_batch=True)
# return feeding list and loss operator
return {
'src_word_idx': src_word_idx,
'src_sequence_length': src_sequence_length,
'trg_word_idx': trg_word_idx,
'trg_sequence_length': trg_sequence_length,
'lbl_word_idx': lbl_word_idx
}, loss
else:
start_tokens = tf.ones([tf.shape(src_word_idx)[0], ],
tf.int32) * START_TOKEN_IDX
# share the same embedding weights with target word
trg_embedding_weights = tf.get_variable(
"target_word_embeddings", [target_dict_dim, embedding_dim])
inference_decoder = beam_search_decoder.BeamSearchDecoder(
cell=decoder_cell,
embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens),
start_tokens=start_tokens,
end_token=END_TOKEN_IDX,
initial_state=tf.nn.rnn_cell.LSTMStateTuple(
tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
beam_width=beam_size,
output_layer=output_layer)
decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
decoder=inference_decoder,
output_time_major=False,
#impute_finished=True,# error occurs
maximum_iterations=max_generation_length)
predicted_ids = decoder_outputs_decode.predicted_ids
return {
'src_word_idx': src_word_idx,
'src_sequence_length': src_sequence_length
}, predicted_ids
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in vars(args).iteritems():
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def padding_data(data, padding_size, value):
data = data + [value] * padding_size
return data[:padding_size]
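# Illustration (not part of the commit): padding_data pads short sequences with `value`
# and truncates long ones, so every row in a batch has exactly `padding_size` tokens.
#   padding_data([4, 9, 7], 5, END_TOKEN_IDX)          -> [4, 9, 7, 1, 1]
#   padding_data([4, 9, 7, 2, 6, 8], 5, END_TOKEN_IDX) -> [4, 9, 7, 2, 6]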
def save(sess, path, var_list=None, global_step=None):
saver = tf.train.Saver(var_list)
save_path = saver.save(sess, save_path=path, global_step=global_step)
print('Model saved at %s' % save_path)
def restore(sess, path, var_list=None):
# var_list = None returns the list of all saveable variables
saver = tf.train.Saver(var_list)
saver.restore(sess, save_path=path)
print('model restored from %s' % path)
def adapt_batch_data(data):
src_seq = map(lambda x: x[0], data)
trg_seq = map(lambda x: x[1], data)
lbl_seq = map(lambda x: x[2], data)
src_sequence_length = np.array(
[len(seq) for seq in src_seq]).astype('int32')
src_seq_maxlen = np.max(src_sequence_length)
trg_sequence_length = np.array(
[len(seq) for seq in trg_seq]).astype('int32')
trg_seq_maxlen = np.max(trg_sequence_length)
src_seq = np.array(
[padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
for seq in src_seq]).astype('int32')
trg_seq = np.array(
[padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
for seq in trg_seq]).astype('int32')
lbl_seq = np.array(
[padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
for seq in lbl_seq]).astype('int32')
return {
'src_word_idx': src_seq,
'src_sequence_length': src_sequence_length,
'trg_word_idx': trg_seq,
'trg_sequence_length': trg_sequence_length,
'lbl_word_idx': lbl_seq
}
def train():
feeding_dict, loss = seq_to_seq_net(
embedding_dim=args.embedding_dim,
encoder_size=args.encoder_size,
decoder_size=args.decoder_size,
source_dict_dim=args.dict_size,
target_dict_dim=args.dict_size,
is_generating=False,
beam_size=args.beam_size,
max_generation_length=args.max_generation_length)
global_step = tf.Variable(0, trainable=False, name='global_step')
trainable_params = tf.trainable_variables()
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
gradients = tf.gradients(loss, trainable_params)
# may clip the gradients
clip_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
updates = optimizer.apply_gradients(
zip(gradients, trainable_params), global_step=global_step)
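# Note added for clarity: the clipped gradients computed above (clip_gradients) are not the
# ones passed to apply_gradients, so training currently applies the raw, unclipped gradients;
# pass zip(clip_gradients, trainable_params) instead if clipping is actually intended.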
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
train_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
def do_validation():
total_loss = 0.0
count = 0
for batch_id, data in enumerate(test_batch_generator()):
adapted_batch_data = adapt_batch_data(data)
outputs = sess.run([loss],
feed_dict={
item[1]: adapted_batch_data[item[0]]
for item in feeding_dict.items()
})
total_loss += outputs[0]
count += 1
return total_loss / count
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_l)
sess.run(init_g)
for pass_id in xrange(args.pass_num):
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_batch_generator()):
adapted_batch_data = adapt_batch_data(data)
words_seen += np.sum(adapted_batch_data['src_sequence_length'])
words_seen += np.sum(adapted_batch_data['trg_sequence_length'])
outputs = sess.run([updates, loss],
feed_dict={
item[1]: adapted_batch_data[item[0]]
for item in feeding_dict.items()
})
print("pass_id=%d, batch_id=%d, train_loss: %f" %
(pass_id, batch_id, outputs[1]))
pass_end_time = time.time()
test_loss = do_validation()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_loss, words_per_sec, time_consumed))
def infer():
feeding_dict, predicted_ids = seq_to_seq_net(
embedding_dim=args.embedding_dim,
encoder_size=args.encoder_size,
decoder_size=args.decoder_size,
source_dict_dim=args.dict_size,
target_dict_dim=args.dict_size,
is_generating=True,
beam_size=args.beam_size,
max_generation_length=args.max_generation_length)
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
with tf.Session(config=config) as sess:
restore(sess, './checkpoint/tf_seq2seq-1500')
for batch_id, data in enumerate(test_batch_generator()):
src_seq = map(lambda x: x[0], data)
source_language_seq = [
src_dict[item] for seq in src_seq for item in seq
]
src_sequence_length = np.array(
[len(seq) for seq in src_seq]).astype('int32')
src_seq_maxlen = np.max(src_sequence_length)
src_seq = np.array([
padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
for seq in src_seq
]).astype('int32')
outputs = sess.run([predicted_ids],
feed_dict={
feeding_dict['src_word_idx']: src_seq,
feeding_dict['src_sequence_length']:
src_sequence_length
})
print("\nDecoder result comparison: ")
source_language_seq = ' '.join(source_language_seq).lstrip(
'<s>').rstrip('<e>').strip()
inference_seq = ''
print(" --> source: " + source_language_seq)
for item in outputs[0][0]:
if item[0] == END_TOKEN_IDX: break
inference_seq += ' ' + trg_dict.get(item[0], '<unk>')
print(" --> inference: " + inference_seq)
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
if args.infer_only:
infer()
else:
train()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import tensorflow as tf
import paddle.v2 as paddle
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser("mnist model benchmark.")
parser.add_argument(
'--batch_size', type=int, default=128, help='The minibatch size.')
parser.add_argument(
'--iterations', type=int, default=35, help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=5, help='The number of passes.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
args = parser.parse_args()
return args
def run_benchmark(args):
def weight_variable(dtype, shape):
initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
return tf.Variable(initial)
def bias_variable(dtype, shape):
initial = tf.constant(0.1, shape=shape, dtype=dtype)
return tf.Variable(initial)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
labels = tf.placeholder(tf.int64, shape=(None, ))
# conv1, relu, pool1
conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
conv1_bias = bias_variable(DTYPE, [20])
conv1 = tf.nn.conv2d(
images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
pool1 = tf.nn.max_pool(
relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# conv2, relu, pool2
conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
conv2_bias = bias_variable(DTYPE, [50])
conv2 = tf.nn.conv2d(
pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
pool2 = tf.nn.max_pool(
relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# FC
pool_shape = pool2.get_shape().as_list()
hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
fc_bias = bias_variable(DTYPE, [10])
logits = tf.matmul(reshape, fc_weights) + fc_bias
# Get prediction
prediction = tf.nn.softmax(logits)
# Loss
one_hot_labels = tf.one_hot(labels, depth=10)
cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
avg_cost = tf.reduce_mean(cost)
# Get accuracy
correct = tf.equal(tf.argmax(prediction, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# metrics, g_accuracy
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_accuracy = tf.metrics.accuracy(
labels, tf.argmax(
prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
g_accuracy_reset_op = tf.variables_initializer(vars)
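# Note for readers: tf.metrics.accuracy returns a (value, update_op) pair backed by local
# counter variables, so sess.run(g_accuracy) both folds the current batch into the running
# totals and reads the accumulated accuracy, while g_accuracy_reset_op re-initializes those
# counters (used below to reset the metric between passes).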
# Optimizer
opt = tf.train.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
train_op = opt.minimize(avg_cost)
# train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=args.batch_size)
def eval_test():
sess.run(g_accuracy_reset_op)
for batch_id, data in enumerate(test_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
loss, acc, g_acc = sess.run(
[avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
return g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
for pass_id in range(args.pass_num):
sess.run(g_accuracy_reset_op)
pass_start = time.time()
for batch_id, data in enumerate(train_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
"int64")
start = time.time()
_, loss, acc, g_acc = sess.run(
[train_op, avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
end = time.time()
print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
(pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
pass_end = time.time()
test_avg_acc = eval_test()
print(
"pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
% (pass_id, g_acc[1], test_avg_acc,
(pass_end - pass_start) / 1000))
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
run_benchmark(args)
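As with the fluid scripts driven by run.sh above, this baseline is launched directly from the command line; a typical invocation (the script name is an assumption about where the file sits in the benchmark tree) would be:

python mnist.py --device=GPU --batch_size=128 --pass_num=5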
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py
Get help: python resnet.py --help
See performance on flowers: python resnet.py
Train on cifar10: python resnet.py --data=cifar10 --with_test
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import paddle.v2 as paddle
import tensorflow as tf
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser('Convolution model benchmark.')
parser.add_argument(
'--model',
type=str,
choices=['resnet'],
default='resnet',
help='The model architecture.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='use real data or fake data')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test'
)
parser.add_argument(
'--iterations',
type=int,
default=105,
help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=300, help='The number of passes.')
parser.add_argument(
'--order',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
help='The data order; currently only NHWC is used.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
parser.add_argument(
'--data',
type=str,
default='flowers102',
choices=['flowers102', 'cifar10'],
help='The kinds of data.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
args = parser.parse_args()
return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
vars(args)['iterations'] = vars(args)['pass_num'] * 1000 if vars(args)[
'with_test'] else vars(args)['iterations']
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def fixed_padding(inputs, kernel_size, data_format):
"""Pads the input along the spatial dimensions independently of input size.
Args:
inputs: A tensor of size [batch, channels, height_in, width_in] or
[batch, height_in, width_in, channels] depending on data_format.
kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
Should be a positive integer.
data_format: The input format ('channels_last' or 'channels_first').
Returns:
A tensor with the same format as the input with the data either intact
(if kernel_size == 1) or padded (if kernel_size > 1).
"""
pad_total = kernel_size - 1
pad_beg = pad_total // 2
pad_end = pad_total - pad_beg
if data_format == 'channels_first':
padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end],
[pad_beg, pad_end]])
else:
padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
[pad_beg, pad_end], [0, 0]])
return padded_inputs
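# Worked example (illustrative) of the padding arithmetic above: for kernel_size = 7,
# pad_total = 6, so pad_beg = 3 and pad_end = 3; for kernel_size = 3, pad_beg = 1 and
# pad_end = 1. The explicit padding therefore depends only on the kernel size, never on
# the spatial size of the input, which keeps strided convolutions consistent with fluid.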
def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
"""Strided 2-D convolution with explicit padding."""
# The padding is consistent and is based only on `kernel_size`, not on the
# dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
# This is consistent with PaddlePaddle.
# In addition, the calculation for output size in TensorFlow can refer:
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc
if strides > 1:
inputs = fixed_padding(inputs, kernel_size, data_format)
return tf.layers.conv2d(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
padding=('SAME' if strides == 1 else 'VALID'),
use_bias=False,
kernel_initializer=tf.variance_scaling_initializer(),
data_format=data_format)
def conv_bn(inputs,
filters,
kernel_size,
strides,
is_training,
data_format,
act=True):
# def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
# set fused=True for a significant performance boost. See
# https://www.tensorflow.org/performance/performance_guide#common_fused_ops
inputs = conv2d_fixed_padding(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
data_format=data_format)
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if data_format == 'channels_first' else 3,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
if act:
inputs = tf.nn.relu(inputs)
return inputs
def basicblock(inputs, filters, is_training, projection_shortcut, strides,
data_format):
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
inputs = conv_bn(inputs, filters, 3, strides, is_training, data_format)
inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
inputs = inputs + shortcut
inputs = tf.nn.relu(inputs)
return inputs
def bottleneck(inputs, filters, is_training, projection_shortcut, strides,
data_format):
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format)
inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
inputs = conv_bn(
inputs, filters * 4, 1, 1, is_training, data_format, act=False)
inputs = inputs + shortcut
inputs = tf.nn.relu(inputs)
return inputs
def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name,
data_format):
# Bottleneck blocks end with 4x the number of filters as they start with
filters_out = 4 * filters if block_fn is bottleneck else filters
def projection_shortcut(inputs):
return conv2d_fixed_padding(
inputs=inputs,
filters=filters_out,
kernel_size=1,
strides=strides,
data_format=data_format)
# Only the first block per block_layer uses projection_shortcut and strides
inputs = block_fn(inputs, filters, is_training, projection_shortcut,
strides, data_format)
for _ in range(1, blocks):
inputs = block_fn(inputs, filters, is_training, None, 1, data_format)
return tf.identity(inputs, name)
def resnet_imagenet(depth, class_dim, data_format):
"""Returns the ResNet model for a given size and number of output classes."""
def resnet_generator(block_fn,
layers,
num_classes,
data_format='channels_last'):
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
def model(inputs, is_training):
"""Constructs the ResNet model given the inputs."""
if data_format == 'channels_first':
# Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
# This provides a large performance boost on GPU. See
# https://www.tensorflow.org/performance/performance_guide#data_formats
inputs = tf.transpose(inputs, [0, 3, 1, 2])
inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = tf.layers.max_pooling2d(
inputs=inputs,
pool_size=3,
strides=2,
padding='SAME',
data_format=data_format)
inputs = tf.identity(inputs, 'initial_max_pool')
inputs = block_layer(inputs, 64, block_fn, layers[0], 1,
is_training, 'block_layer1', data_format)
inputs = block_layer(inputs, 128, block_fn, layers[1], 2,
is_training, 'block_layer2', data_format)
inputs = block_layer(inputs, 256, block_fn, layers[2], 2,
is_training, 'block_layer3', data_format)
inputs = block_layer(inputs, 512, block_fn, layers[3], 2,
is_training, 'block_layer4', data_format)
inputs = tf.layers.average_pooling2d(
inputs=inputs,
pool_size=7,
strides=1,
padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(inputs,
[-1, 512 if block_fn is basicblock else 2048])
inputs = tf.layers.dense(inputs=inputs, units=num_classes)
inputs = tf.identity(inputs, 'final_dense')
return inputs
return model
model_params = {
18: {
'block': basicblock,
'layers': [2, 2, 2, 2]
},
34: {
'block': basicblock,
'layers': [3, 4, 6, 3]
},
50: {
'block': bottleneck,
'layers': [3, 4, 6, 3]
},
101: {
'block': bottleneck,
'layers': [3, 4, 23, 3]
},
152: {
'block': bottleneck,
'layers': [3, 8, 36, 3]
},
200: {
'block': bottleneck,
'layers': [3, 24, 36, 3]
}
}
if depth not in model_params:
raise ValueError('Not a valid depth:', depth)
params = model_params[depth]
return resnet_generator(params['block'], params['layers'], class_dim,
data_format)
def resnet_cifar10(depth, num_classes, data_format):
if depth % 6 != 2:
raise ValueError('depth must be 6n + 2:', depth)
num_blocks = (depth - 2) // 6
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
def model(inputs, is_training):
inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training,
'block_layer1', data_format)
inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training,
'block_layer2', data_format)
inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training,
'block_layer3', data_format)
inputs = tf.layers.average_pooling2d(
inputs=inputs,
pool_size=8,
strides=1,
padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(inputs, [-1, 64])
inputs = tf.layers.dense(inputs=inputs, units=num_classes)
inputs = tf.identity(inputs, 'final_dense')
return inputs
return model
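# Quick check (illustrative) of the depth rule above: valid depths have the form 6n + 2,
# e.g. 20, 32, 44, 56; depth = 32 gives (32 - 2) // 6 = 5 basic blocks per stage, and the
# benchmark below instantiates resnet_cifar10(32, ...) when --data=cifar10 is selected.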
def run_benchmark(args, data_format='channels_last', device='/cpu:0'):
"""Our model_fn for ResNet to be used with our Estimator."""
class_dim = 1000
dshape = (None, 224, 224, 3)
pdshape = (3, 224, 224)
if args.data == 'flowers102':
class_dim = 102
dshape = (None, 224, 224, 3)
pdshape = (3, 224, 224)
elif args.data == 'cifar10':
class_dim = 10
dshape = (None, 32, 32, 3)
pdshape = (3, 32, 32)
with tf.device(device):
images = tf.placeholder(DTYPE, shape=dshape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
network = resnet_cifar10(
32, class_dim,
data_format) if args.data == 'cifar10' else resnet_imagenet(
50, class_dim, data_format)
logits = network(inputs=images, is_training=is_training)
cross_entropy = tf.losses.softmax_cross_entropy(
logits=logits, onehot_labels=onehot_labels)
avg_cost = tf.reduce_mean(cross_entropy)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
lr = 0.1 if args.data == 'cifar10' else 0.01
optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
# Batch norm requires update_ops to be added as a train_op dependency.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=100)
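    # Evaluate accuracy over the whole test set; the report line below also uses
    # pass_id, num_samples and train_elapsed from the enclosing training loop.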
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, np.mean(test_accs)))
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
if args.use_fake_data:
data = train_reader().next()
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype('int64')
iters, num_samples, start_time = 0, 0, 0.0
for pass_id in range(args.pass_num):
if iters == args.iterations:
break
train_accs = []
train_losses = []
for batch_id, data in enumerate(train_reader()):
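                # Treat the first skip_batch_num batches as warm-up: restart the timer
                # and the sample counter so the reported throughput excludes them.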
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
if not args.use_fake_data:
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
'int64')
_, loss, acc = sess.run([train_op, avg_cost, accuracy],
feed_dict={
images: images_data,
labels: labels_data,
is_training: True
})
iters += 1
train_accs.append(acc)
train_losses.append(loss)
num_samples += len(data)
print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" %
(pass_id, iters, loss, acc))
train_elapsed = time.time() - start_time
print("Pass=%d, Loss=%f, Accuray=%f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
# evaluation
if args.with_test:
test()
if not args.with_test:
duration = time.time() - start_time
examples_per_sec = num_samples / duration
sec_per_batch = duration / (iters - args.skip_batch_num)
print('Total examples: %d, total time: %.5f' %
(num_samples, duration))
print('%.5f examples/sec, %.5f sec/batch' %
(examples_per_sec, sec_per_batch))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if tf.test.is_built_with_cuda():
device = '/device:GPU:0'
if args.order == 'NHWC':
data_format = 'channels_last'
else:
data_format = 'channels_first'
else:
device = '/cpu:0'
if args.order == 'NHWC':
data_format = 'channels_last'
else:
raise ValueError('Only support NHWC order in CPU mode')
run_benchmark(args, data_format, device)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import tensorflow as tf
import paddle.v2 as paddle
def parse_args():
parser = argparse.ArgumentParser("LSTM model benchmark.")
parser.add_argument(
'--batch_size',
type=int,
default=32,
help='The sequence number of a batch data. (default: %(default)d)')
parser.add_argument(
'--stacked_num',
type=int,
default=5,
help='Number of lstm layers to stack. (default: %(default)d)')
parser.add_argument(
'--embedding_dim',
type=int,
default=512,
help='Dimension of embedding table. (default: %(default)d)')
parser.add_argument(
'--hidden_dim',
type=int,
default=512,
help='Hidden size of lstm unit. (default: %(default)d)')
parser.add_argument(
'--pass_num',
type=int,
default=10,
help='Epoch number to train. (default: %(default)d)')
parser.add_argument(
'--learning_rate',
type=float,
default=0.0002,
help='Learning rate used to train. (default: %(default)f)')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
args = parser.parse_args()
return args
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def dynamic_lstm_model(dict_size,
embedding_dim,
hidden_dim,
stacked_num,
class_num=2,
is_train=True):
word_idx = tf.placeholder(tf.int64, shape=[None, None])
sequence_length = tf.placeholder(tf.int64, shape=[None, ])
embedding_weights = tf.get_variable('word_embeddings',
[dict_size, embedding_dim])
embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
lstm_cell = tf.nn.rnn_cell.LSTMCell(
num_units=hidden_dim, use_peepholes=False)
stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num)
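    # Note: reusing one LSTMCell instance for every layer follows the older TF idiom;
    # recent TF releases require constructing a separate cell object per layer.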
# final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
_, final_state = tf.nn.dynamic_rnn(
cell=stacked_cell,
inputs=embedding,
dtype=tf.float32,
sequence_length=sequence_length)
w = tf.Variable(
tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
bias = tf.Variable(
tf.constant(
value=0.0, shape=[class_num], dtype=tf.float32))
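    # final_state[-1] is the top layer's LSTMStateTuple; index 1 selects its hidden state h.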
prediction = tf.matmul(final_state[-1][1], w) + bias
if not is_train:
return (word_idx, sequence_length), tf.nn.softmax(prediction)
label = tf.placeholder(tf.int64, shape=[None, ])
loss = tf.nn.softmax_cross_entropy_with_logits(
labels=tf.one_hot(label, 2), logits=prediction)
avg_loss = tf.reduce_mean(loss)
correct_count = tf.equal(tf.argmax(prediction, 1), label)
acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
reset_op = tf.variables_initializer(vars)
return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
def padding_data(data, padding_size, value):
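    # Pad the sequence with `value`, then truncate so its length is exactly padding_size.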
data = data + [value] * padding_size
return data[:padding_size]
def train(args):
word_dict = paddle.dataset.imdb.word_dict()
dict_size = len(word_dict)
feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
train_op = adam_optimizer.minimize(avg_loss)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=25000),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.test(word_dict), buf_size=25000),
batch_size=args.batch_size)
def do_validation(sess):
sess.run(reset_op)
for batch_id, data in enumerate(test_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
            # Do not run train_op during validation; the weights must not be updated here.
            loss, fetch_acc, fetch_g_acc = sess.run(
                [avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
return fetch_g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_l)
sess.run(init_g)
for pass_id in xrange(args.pass_num):
# clear accuracy local variable
sess.run(reset_op)
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
words_seen += np.sum(sequence_length)
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
_, loss, fetch_acc, fetch_g_acc = sess.run(
[train_op, avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
% (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
pass_end_time = time.time()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
test_acc = do_validation(sess)
print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_acc, words_per_sec, time_consumed))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if args.infer_only:
pass
else:
train(args)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in TensorFlow"""
import tensorflow as tf
import paddle.v2 as paddle
import numpy as np
import argparse
import time
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test')
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument(
'--data_format',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
help='The data order, NCHW=[batch, channels, height, width].'
'Only support NHWC right now.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
args = parser.parse_args()
class VGG16Model(object):
def __init__(self):
self.parameters = []
def batch_norm_relu(self, inputs, is_training):
"""Performs a batch normalization followed by a ReLU."""
# We set fused=True for a significant speed boost. See
# https://www.tensorflow.org/speed/speed_guide#common_fused_ops
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if args.data_format == 'NCHW' else -1,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
inputs = tf.nn.relu(inputs)
return inputs
def conv_bn_layer(self,
name,
images,
kernel_shape,
is_training,
drop_rate=0.0):
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
kernel_shape, dtype=tf.float32, stddev=1e-1),
name='weights')
conv = tf.nn.conv2d(
images,
kernel, [1, 1, 1, 1],
data_format=args.data_format,
padding='SAME')
biases = tf.Variable(
tf.constant(
0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(conv, biases)
out = self.batch_norm_relu(out, is_training)
out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
return out
def fc_layer(self, name, inputs, shape):
with tf.name_scope(name) as scope:
fc_w = tf.Variable(
tf.truncated_normal(
shape, dtype=tf.float32, stddev=1e-1),
name='weights')
fc_b = tf.Variable(
tf.constant(
0.0, shape=[shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
return out
def network(self, images, class_dim, is_training):
""" VGG16 model structure.
TODO(kuke): enable this network to support the 'NCHW' data format
"""
# conv1
conv1_1 = self.conv_bn_layer(
'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
conv1_2 = self.conv_bn_layer(
'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
# pool1
pool1 = tf.nn.max_pool(
conv1_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool1')
# conv2
conv2_1 = self.conv_bn_layer(
'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
conv2_2 = self.conv_bn_layer(
'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
# pool2
pool2 = tf.nn.max_pool(
conv2_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool2')
# conv3
conv3_1 = self.conv_bn_layer(
'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
conv3_2 = self.conv_bn_layer(
'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
conv3_3 = self.conv_bn_layer(
'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
# pool3
pool3 = tf.nn.max_pool(
conv3_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool3')
# conv4
conv4_1 = self.conv_bn_layer(
'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
conv4_2 = self.conv_bn_layer(
'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv4_3 = self.conv_bn_layer(
'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool4
pool4 = tf.nn.max_pool(
conv4_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# conv5
conv5_1 = self.conv_bn_layer(
'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_2 = self.conv_bn_layer(
'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_3 = self.conv_bn_layer(
'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool5
pool5 = tf.nn.max_pool(
conv5_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
            name='pool5')
# flatten
shape = int(np.prod(pool5.get_shape()[1:]))
pool5_flat = tf.reshape(pool5, [-1, shape])
# fc1
drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
fc1 = self.fc_layer('fc1', drop, [shape, 512])
# fc2
bn = self.batch_norm_relu(fc1, is_training)
drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
fc2 = self.fc_layer('fc2', drop, [512, 512])
fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
return fc3
def run_benchmark():
"""Run benchmark on cifar10 or flowers."""
if args.data_set == "cifar10":
class_dim = 10
raw_shape = (3, 32, 32)
dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
None, 3, 32, 32)
else:
class_dim = 102
raw_shape = (3, 224, 224)
dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
None, 3, 224, 224)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(tf.float32, shape=dat_shape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
vgg16 = VGG16Model()
logits = vgg16.network(images, class_dim, is_training)
loss = tf.losses.softmax_cross_entropy(
onehot_labels=onehot_labels, logits=logits)
avg_loss = tf.reduce_mean(loss)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_loss)
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
buf_size=5120),
batch_size=args.batch_size)
# test
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
return np.mean(test_accs)
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.num_passes):
# train
num_samples = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
train_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
train_labels = np.array(map(lambda x: x[1], data)).astype(
'int64')
_, loss, acc = sess.run([train_op, avg_loss, accuracy],
feed_dict={
images: train_images,
labels: train_labels,
is_training: True
})
iters += 1
num_samples += len(data)
print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
(pass_id, iters, loss, acc))
train_elapsed = time.time() - start_time
# test
pass_test_acc = test()
print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, pass_test_acc))
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
print_arguments()
run_benchmark()
...@@ -36,7 +36,8 @@ MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") ...@@ -36,7 +36,8 @@ MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h
IF(${CBLAS_PROVIDER} STREQUAL "MKLML") IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
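# NCCL is only needed when PaddlePaddle is built with GPU support.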
if(NOT WITH_GPU)
return()
endif()
include(ExternalProject)
set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
if(WITH_DSO)
# If we use DSO, we do not build nccl, just download the dependencies
set(NCCL_BUILD_COMMAND "")
set(NCCL_INSTALL_COMMAND "")
set(NCCL_INSTALL_DIR "")
else()
# otherwise, we build nccl and link it.
set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
# Note: cuda 8.0 is needed to make nccl
# When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root
set(NCCL_BUILD_COMMAND "make -j 8")
set(NCCL_INSTALL_COMMAND "make install PREFIX=${NCCL_INSTALL_DIR}")
endif()
ExternalProject_Add(
extern_nccl
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/NVIDIA/nccl.git"
GIT_TAG "v1.3.4-1"
PREFIX "${NCCL_SOURCE_DIR}"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND "${NCCL_BUILD_COMMAND}"
INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}"
INSTALL_DIR "${NCCL_INSTALL_DIR}"
TEST_COMMAND ""
)
if(WITH_DSO)
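  # With dynamic loading nothing is linked at build time, so expose a placeholder
  # `nccl` target: a dummy static library for CMake < 3.3, an INTERFACE library otherwise.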
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
add_library(nccl STATIC ${dummyfile})
else()
add_library(nccl INTERFACE)
endif()
else()
add_library(nccl STATIC IMPORTED GLOBAL)
set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
endif()
add_dependencies(nccl extern_nccl)
...@@ -244,11 +244,11 @@ function(cc_test TARGET_NAME) ...@@ -244,11 +244,11 @@ function(cc_test TARGET_NAME)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS}) add_executable(${TARGET_NAME} ${cc_test_SRCS})
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS) # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START") if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END) list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif() endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS} COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
...@@ -311,8 +311,8 @@ function(nv_test TARGET_NAME) ...@@ -311,8 +311,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME})
endif() endif()
endfunction(nv_test) endfunction(nv_test)
...@@ -387,8 +387,8 @@ function(hip_test TARGET_NAME) ...@@ -387,8 +387,8 @@ function(hip_test TARGET_NAME)
endif() endif()
add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources}) add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME})
endif() endif()
endfunction(hip_test) endfunction(hip_test)
......
...@@ -16,3 +16,4 @@ ...@@ -16,3 +16,4 @@
block.md block.md
scope.md scope.md
executor.md executor.md
parallel_executor.md
...@@ -16,3 +16,4 @@ Core Concepts ...@@ -16,3 +16,4 @@ Core Concepts
block.md block.md
scope.md scope.md
executor.md executor.md
parallel_executor.md
# Problem # Kernel Hint Design
## Problem
In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this. In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
In the current design, we use KernelType to describe one kernel. In the current design, we use KernelType to describe one kernel.
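For illustration only, a minimal sketch of how such hints surface at the Python layer level, assuming a `paddle.fluid`-style API in which `conv2d` exposes `use_cudnn` and `fill_constant` exposes `force_cpu` (the exact module path and layer names are assumptions, not part of this design note):

```python
import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32')
# Hint: prefer the cuDNN kernel for this convolution.
conv = fluid.layers.conv2d(
    input=image, num_filters=64, filter_size=3, use_cudnn=True)
# Hint: keep this constant on the CPU even when the program runs on a GPU place.
counter = fluid.layers.fill_constant(
    shape=[1], dtype='float32', value=0.0, force_cpu=True)
```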
......
# Background # Kernel Selection
## Background
Every operator has many kernels because there are multiple data types, places, data layout, library type that Fluid supports. We use the `OpKernelType ` to describe kernel types that operators can hold. Every operator has many kernels because there are multiple data types, places, data layout, library type that Fluid supports. We use the `OpKernelType ` to describe kernel types that operators can hold.
The `OpKernelType ` is as follows: The `OpKernelType ` is as follows:
......
Install and Build install and Compile
================= ==========
.. _install_steps: .. _install_steps:
Install Steps PaddlePaddle provides various methods of installation for many different users
++++++++
You can choose either pip or Docker to complete your install: Focus on Deep Learning Model Development
-----------------
PaddlePaddle provides lots of packages of python wheel , that pip can install:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
pip_install_en.rst pip_install_en.rst
docker_install_en.rst
Build from Source This is the most convenient way of installation. Please choose the right installation package for your machine configuration and system.
-----------------
Follow the Bottom Frame
----------
PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
.. toctree::
:maxdepth: 1
docker_install_en.rst
.. warning:: We recommend running PaddlePaddle in Docker. This method has the following advantages:
We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary. - Does not require installation of third-party dependencies.
- Easy to share runtime environment.
.. toctree:: Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below:
.. toctree::
:maxdepth: 1 :maxdepth: 1
build_from_source_en.md build_from_source_en.rst
.. warning::
One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time consuming.
FAQ FAQ
++++++++++ -----------
For any problems during installation, please refer to the page below for answers:
:ref:`FAQ <install_faq>`
If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community:
`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_ `create an issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_
...@@ -65,39 +65,55 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D ...@@ -65,39 +65,55 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D
不使用PaddlePaddle.org工具 不使用PaddlePaddle.org工具
-------------------------- --------------------------
使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即 使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。该方法与 `从源码编译PaddlePaddle <http://paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html>`_ 相似,通过从源码中构建可用于编译PaddlePaddle文档的Docker镜像并运行,在进入Docker容器后使用源码中的脚本构建PaddlePaddle文档,具体步骤如下:
[TBD] .. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# 从源码中构建可用于编译PaddlePaddle文档的Docker镜像
docker build -t paddle:dev .
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
# 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档
bash -x /paddle/paddle/scripts/docker/build.sh
注:上述命令把当前目录(源码根目录)映射为 container 里的 :code:`/paddle` 目录。
编译完成后,会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 、 ``api/en/html`` 共三个子目录,分别进入这些目录下,执行以下命令:
.. code-block:: bash
python -m SimpleHTTPServer 8088
在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。
如果不想使用Docker,也可以使用以下命令直接构建PaddlePaddle文档,即 如果不想使用Docker,也可以使用以下命令直接构建PaddlePaddle文档,即
.. code-block:: bash .. code-block:: bash
mkdir paddle
cd paddle
git clone https://github.com/PaddlePaddle/Paddle.git git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir -p build mkdir -p build
cd build cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
# 如果只需要构建使用文档,则执行以下命令 # 如果只需要构建使用文档,则执行以下命令
make -j $processors gen_proto_py make -j $processors paddle_docs
make -j $processors paddle_docs paddle_docs_cn
# 如果只需要构建API,则执行以下命令 # 如果只需要构建API,则执行以下命令
make -j $processors gen_proto_py framework_py_proto make -j $processors paddle_apis
make -j $processors copy_paddle_pybind
make -j $processors paddle_api_docs
其中$processors代表启动和CPU核一样多的进程来并行编译,可以根据本机的CPU核数设置相应的值。 其中$processors代表启动和CPU核一样多的进程来并行编译,可以根据本机的CPU核数设置相应的值。
编译完成后,进入 ``doc/v2`` 目录,如果选择构建文档则会在该目录下生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会生成 ``api/en/html`` 目录,分别进入这些目录下,执行以下命令: 编译完成后,同样会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录,分别进入这些子目录下,执行以下命令:
.. code-block:: bash .. code-block:: bash
python -m SimpleHTTPServer 8088 python -m SimpleHTTPServer 8088
在浏览器中输入 http://localhost:8088 就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。下图为生成的 ``v2`` 英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。
.. image:: src/doc_en.png .. image:: src/doc_en.png
:align: center :align: center
......
...@@ -68,39 +68,56 @@ Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develo ...@@ -68,39 +68,56 @@ Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develo
Manually Building the Documentation Manually Building the Documentation
------------------------------------- -------------------------------------
Build PaddlePaddle's documentation with Docker,you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After Docker is installed, you could use the scripts in the source directory to build the documentation. Build PaddlePaddle's documentation with Docker,you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. This method is quite similar to ` Build From Sources <http://paddlepaddle.org/docs/develop/documentation/en/build_and_install/build_from_source_en.html>`_ , by constructing, from source code, a docker image that can be used to build PaddlePaddle documentation. Enter the Docker container and use the script ``build.sh`` in the source directory to build the PaddlePaddle documentation. The specific steps are as follows:
[TBD] .. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# Construct a docker image from source code
docker build -t paddle:dev .
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
# Use build.sh to build PaddlePaddle documentation
bash -x /paddle/paddle/scripts/docker/build.sh
Note: The above commands maps the current directory (source root directory) to the :code:`/paddle` directory in the container.
After compiling, there should be two generated directories: ``doc/v2`` and ``doc/fluid``, where three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Use a web browser and navigate to http://localhost:8088, where you can see the compiled ``v2`` and ``fluid`` Chinese/English documentation pages and the English API pages.
If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation. If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
.. code-block:: bash .. code-block:: bash
mkdir paddle
cd paddle
git clone https://github.com/PaddlePaddle/Paddle.git git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir -p build mkdir -p build
cd build cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
# If you only need to build documents, use the following commands # If you only need to build documents, use the following commands
make -j $processors gen_proto_py make -j $processors paddle_docs
make -j $processors paddle_docs paddle_docs_cn
# If you only need to build APIs, use the following commands # If you only need to build APIs, use the following commands
make -j $processors gen_proto_py framework_py_proto make -j $processors paddle_apis
make -j $processors copy_paddle_pybind
make -j $processors paddle_api_docs
$processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine. $processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine.
After the compilation is complete, enter the ``doc/v2`` directory. If you chose to build documents, it will generate ``cn/html/`` and ``en/html`` subdirectories under this directory. If you chose to build APIs,it will generate``api/en/html`` subdirectory. Please enter these directories respectively and execute the following commands: After compiling, there also should be two generated directories: ``doc/v2`` and ``doc/fluid`` . If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html`` will be generated in both two directories. If you chose to build APIs,a subdirectory ``api/en/html`` will be generated. Please enter these directories respectively and execute the following commands:
.. code-block:: bash .. code-block:: bash
python -m SimpleHTTPServer 8088 python -m SimpleHTTPServer 8088
Use a web browser and navigate to http://localhost:8000, you could see the compiled Chinese/English documents page and the English APIs page. The following figure is an example of the built English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging. Use a web browser and navigate to http://localhost:8000, you could see the compiled ``v2`` 's and ``fluid`` 's Chinese/English documents page and English APIs page. The following figure is an example of the built ``v2`` 's English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
.. image:: src/doc_en.png .. image:: src/doc_en.png
:align: center :align: center
......
## Install and Build ## Install and Build
TBD ### Download & Install
Download the latest C-API development package from the CI system and install it. You can find the required version in the table below:
<table>
<thead>
<tr>
<th>Version Tips</th>
<th>C-API</th>
</tr>
</thead>
<tbody>
<tr>
<td>cpu_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cpu_avx_openblas</td>
<td>-</td>
</tr>
<tr>
<td>cpu_noavx_openblas</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda7.5_cudnn5_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda8.0_cudnn5_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda8.0_cudnn7_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr></tbody></table>
### From source
Users can also build the C-API library from the PaddlePaddle source code with the following compilation options:
<table>
<thead>
<tr>
<th>Options</th>
<th>Value</th>
</tr>
</thead>
<tbody>
<tr>
<td>WITH_C_API</td>
<td>ON</td>
</tr>
<tr>
<td>WITH_PYTHON</td>
<td>OFF(recommended)</td>
</tr>
<tr>
<td>WITH_SWIG_PY</td>
<td>OFF(recommended)</td>
</tr>
<tr>
<td>WITH_GOLANG</td>
<td>OFF(recommended)</td>
</tr>
<tr>
<td>WITH_GPU</td>
<td>ON/OFF</td>
</tr>
<tr>
<td>WITH_MKL</td>
<td>ON/OFF</td>
</tr></tbody></table>
It is best to use the recommended values to avoid linking against unnecessary libraries. Set the other compilation options as needed.
Pull the latest code from GitHub and configure the compilation options as in the following snippet (replace PADDLE_ROOT with the installation path of the PaddlePaddle C-API inference library):
```shell
PADDLE_ROOT=/path/of/capi
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir build
cd build
cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_GOLANG=OFF \
-DWITH_PYTHON=OFF \
-DWITH_MKL=OFF \
-DWITH_GPU=OFF \
..
```
After running the above commands to generate the Makefile, run `make && make install`. After successful compilation, everything required by the C-API ((1) the PaddlePaddle inference library and header files; (2) third-party libraries and header files) will be stored in the `PADDLE_ROOT` directory.
If the compilation succeeds, the directory structure under `PADDLE_ROOT` looks as follows (PaddlePaddle header files and libraries, plus third-party libraries and header files, depending on the linking method):
```text
├── include
│   └── paddle
│   ├── arguments.h
│   ├── capi.h
│   ├── capi_private.h
│   ├── config.h
│   ├── error.h
│   ├── gradient_machine.h
│   ├── main.h
│   ├── matrix.h
│   ├── paddle_capi.map
│   └── vector.h
├── lib
│   ├── libpaddle_capi_engine.a
│   ├── libpaddle_capi_layers.a
│   ├── libpaddle_capi_shared.so
│   └── libpaddle_capi_whole.a
└── third_party
├── gflags
│   ├── include
│   │   └── gflags
│   │   ├── gflags_completions.h
│   │   ├── gflags_declare.h
│   │   ...
│   └── lib
│   └── libgflags.a
├── glog
│   ├── include
│   │   └── glog
│   │   ├── config.h
│   │   ...
│   └── lib
│   └── libglog.a
├── openblas
│   ├── include
│   │   ├── cblas.h
│   │   ...
│   └── lib
│   ...
├── protobuf
│   ├── include
│   │   └── google
│   │   └── protobuf
│   │   ...
│   └── lib
│   └── libprotobuf-lite.a
└── zlib
├── include
│   ...
└── lib
...
```
### Linking Description:
There are three kinds of linking methods:
1. Linking with the dynamic library `libpaddle_capi_shared.so` (the most convenient way; **recommended unless you have special requirements**):
   1. When compiling the CPU version with `OpenBLAS`, you only need to link the single library `libpaddle_capi_shared.so` to develop a prediction program with the C-API.
   1. When compiling the CPU version with `MKL`, you also need to link the MKL libraries directly, because `MKL` has its own dynamic libraries.
   1. When compiling the GPU version, the CUDA libraries are loaded dynamically when the prediction program runs; make sure the CUDA library path is added to the `LD_LIBRARY_PATH` environment variable.
2. Linking with the static library `libpaddle_capi_whole.a`:
   1. Specify the `-Wl,--whole-archive` linking option.
   1. Explicitly link third-party libraries such as `gflags`, `glog`, `libz` and `protobuf`; you can find them under the `PADDLE_ROOT/third_party` directory.
   1. If the C-API was compiled with OpenBLAS, explicitly link `libopenblas.a`.
   1. If the C-API was compiled with MKL, explicitly link the MKL dynamic libraries.
3. Linking with the static libraries `libpaddle_capi_layers.a` and `libpaddle_capi_engine.a`:
   1. This linking method is mainly used for mobile prediction.
   1. `libpaddle_capi_whole.a` is split into these two static libraries to reduce the size of the linked binaries.
   1. Specify `-Wl,--whole-archive -lpaddle_capi_layers` and `-Wl,--no-whole-archive -lpaddle_capi_engine` when linking.
   1. The third-party dependencies must be linked explicitly, the same as in method 2 above.
# Kubernetes Distributed # Distributed Training on Kubernetes
TBD We introduced how to create a PaddlePaddle Job with a single node on Kubernetes in the
previous document.
In this article, we will introduce how to create a PaddlePaddle job with multiple nodes
on a Kubernetes cluster.
## Overall Architecture
Before creating a training job, users need to slice the training data and deploy
the Python scripts along with it into a distributed file system
(we can use different types of Kubernetes Volumes to mount different distributed
file systems). Before training starts, the program copies the training data into the
container, and the models are also saved to the same path during training. The overall
architecture is as follows:
![PaddlePaddle on Kubernetes Architecture](src/k8s-paddle-arch.png)
The above figure describes a distributed training architecture with 3 nodes. Each
Pod mounts a folder of the distributed file system through a Kubernetes Volume to store
training data and models. Kubernetes creates 3 Pods for this training phase and schedules
them onto 3 nodes, and each Pod runs a PaddlePaddle container. After the containers are
created, PaddlePaddle starts the communication between PServer and Trainer and reads the
training data for this training job.
As described above, we can start a PaddlePaddle distributed training job on a
Kubernetes-ready cluster with the following steps:
1. [Build PaddlePaddle Docker Image](#Build a Docker Image)
1. [Split training data and upload to the distributed file system](#Upload Training Data)
1. [Edit a YAML file and create a Kubernetes Job](#Create a Job)
1. [Check the output](#Check The Output)
We will introduce these steps as follows:
### Build a Docker Image
The training Docker image needs to package the paddle pserver and paddle trainer runtimes, and also handle two more steps before we can kick off the training:
- Copying the training data into container.
- Generating the initialization arguments for `Paddle PServer` and `Paddle Training` processes.
Since the official paddlepaddle Docker image already has the runtimes we need, we take it as the base image and add some scripts for the steps mentioned above to build our training image. For more detail, please refer to the following link:
- https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile
```bash
$ cd doc/howto/usage/k8s/src/k8s_train
$ docker build -t [YOUR_REPO]/paddle:mypaddle .
```
Then push the new Docker image to a Docker registry:
```bash
docker push [YOUR_REPO]/paddle:mypaddle
```
**[NOTE]**: in the commands above, `[YOUR_REPO]` represents your Docker repository;
replace it with your own repository name. From here on, `[YOUR_REPO]/paddle:mypaddle`
refers to the Docker image built in this step.
### Prepare Training Data
We can download and split the training data by creating a Kubernetes Job, or customize the
image by editing [k8s_train](./src/k8s_train/).
Before creating the Job, we need to bind a [persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes) for the chosen
distributed file system; the generated dataset will be saved on this volume.
```yaml
apiVersion: batch/v1
kind: Job
metadata:
name: paddle-data
spec:
template:
metadata:
name: pi
spec:
hostNetwork: true
containers:
- name: paddle-data
image: paddlepaddle/paddle-tutorial:k8s_data
imagePullPolicy: Always
volumeMounts:
- mountPath: "/mnt"
name: nfs
env:
- name: OUT_DIR
value: /home/work/mfs/paddle-cluster-job
- name: SPLIT_COUNT
value: "3"
volumes:
- name: nfs
persistentVolumeClaim:
claimName: mfs
restartPolicy: Never
```
Create the Job with the following command:
```bash
> kubectl create -f xxx.yaml
```
If it is created successfully, you will see output like this:
```bash
[root@paddle-kubernetes-node0 nfsdir]$ tree -d
.
`-- paddle-cluster-job
|-- 0
| `-- data
|-- 1
| `-- data
|-- 2
| `-- data
|-- output
|-- quick_start
```
The `paddle-cluster-job` above is the job name for this training job. We need 3
PaddlePaddle training nodes, and the split training data is saved under the `paddle-cluster-job` path:
the folders `0`, `1` and `2` represent the `training_id` of each node, the `quick_start` folder stores the training data, and the `output` folder stores the models and logs.
### Create a Job
Kubernetes allows users to describe objects with YAML files, and we can create them with the
command-line tool.
The Job YAML file describes which Docker image is used for this training job, how many nodes are created, the startup arguments of the `Paddle PServer/Trainer` processes, and the type of Volumes. You can find the details of the YAML fields in
The following is an example for this training job:
```yaml
apiVersion: batch/v1
kind: Job
metadata:
name: paddle-cluster-job
spec:
parallelism: 3
completions: 3
template:
metadata:
name: paddle-cluster-job
spec:
volumes:
- name: jobpath
hostPath:
path: /home/work/mfs
containers:
- name: trainer
image: [YOUR_REPO]/paddle:mypaddle
command: ["bin/bash", "-c", "/root/start.sh"]
env:
- name: JOB_NAME
value: paddle-cluster-job
- name: JOB_PATH
value: /home/jobpath
- name: JOB_NAMESPACE
value: default
- name: TRAIN_CONFIG_DIR
value: recommendation
- name: CONF_PADDLE_NIC
value: eth0
- name: CONF_PADDLE_PORT
value: "7164"
- name: CONF_PADDLE_PORTS_NUM
value: "2"
- name: CONF_PADDLE_PORTS_NUM_SPARSE
value: "2"
- name: CONF_PADDLE_GRADIENT_NUM
value: "3"
volumeMounts:
- name: jobpath
mountPath: /home/jobpath
restartPolicy: Never
```
In the above YAML file:
- `metadata.name`, the job name.
- `parallelism`, the Kubernetes Job creates `parallelism` Pods at the same time.
- `completions`, the Job reaches the success status only when the number of successful Pods
(whose exit code is 0) is equal to `completions`.
- `volumeMounts`, the `name` field `jobpath` is a key and the `mountPath` field is the
path inside the container; `jobpath` is defined in the `volumes` field, where `hostPath`
configures the host path we want to mount.
- `env`, the environment variables of the container; we pass some startup arguments by
this approach, with the following meanings:
  - JOB_PATH: the mount path in the container.
  - JOB_NAME: the job name.
  - TRAIN_CONFIG_DIR: the job path in the container; combined with JOB_NAME it gives the
training data path.
  - CONF_PADDLE_NIC: the `--nics` argument of the `Paddle PServer` process, the network
device name.
  - CONF_PADDLE_PORT: the `--port` argument of the `Paddle PServer` process.
  - CONF_PADDLE_PORTS_NUM: the `--ports_num` argument of `Paddle PServer`, the number of ports
for dense parameter updates.
  - CONF_PADDLE_PORTS_NUM_SPARSE: the `--ports_num_for_sparse` argument of `Paddle PServer`,
the number of ports for sparse parameter updates.
  - CONF_PADDLE_GRADIENT_NUM: the number of training nodes, the `--num_gradient_servers`
argument of `Paddle PServer` and `Paddle Trainer`.
More details can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html).
Once the YAML file is ready, we can create the Job with the Kubernetes command-line tool:
```bash
kubectl create -f job.yaml
```
Upon successful creation, Kubernetes creates 3 Pods as PaddlePaddle training nodes,
pulls the Docker image and begins training.
### Check the Output
During training, we can check the logs and the output models, which are stored in
the `output` folder.
**NOTE**: `node_0`, `node_1` and `node_2` represent the
`trainer_id` of the PaddlePaddle training job rather than the Kubernetes node id.
```bash
[root@paddle-kubernetes-node0 output]# tree -d
.
├── node_0
│   ├── server.log
│   └── train.log
├── node_1
│   ├── server.log
│   └── train.log
├── node_2
......
├── pass-00002
│   ├── done
│   ├── ___embedding_0__.w0
│   ├── ___embedding_1__.w0
......
```
We can check the status of each training Pod by viewing its logs:
```bash
[root@paddle-kubernetes-node0 node_0]# cat train.log
I1116 09:10:17.123121 50 Util.cpp:155] commandline:
/usr/local/bin/../opt/paddle/bin/paddle_trainer
--nics=eth0 --port=7164
--ports_num=2 --comment=paddle_process_by_paddle
--pservers=192.168.129.66,192.168.223.143,192.168.129.71
--ports_num_for_sparse=2 --config=./trainer_config.py
--trainer_count=4 --num_passes=10 --use_gpu=0
--log_period=50 --dot_period=10 --saving_period=1
--local=0 --trainer_id=0
--save_dir=/home/jobpath/paddle-cluster-job/output
I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions
I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done.
[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__]
I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal
I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
I1116 09:10:17.681543 50 GradientMachine.cpp:134] Initing parameters..
I1116 09:10:18.012390 50 GradientMachine.cpp:141] Init parameters done.
I1116 09:10:18.018641 50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
I1116 09:10:18.018950 50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
I1116 09:10:18.019069 50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
I1116 09:10:18.019492 50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
I1116 09:10:18.019716 50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
I1116 09:10:18.019836 50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
```
## Some Additional Details
### Using Environment Variables
Usually we use environment variables to configure the PaddlePaddle Job that runs in
Kubernetes; `start_paddle.py` provides a startup script that converts the environment variables
into the startup arguments of the PaddlePaddle processes:
```python
API = "/api/v1/namespaces/"
JOBSELECTOR = "labelSelector=job-name="
JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
JOB_PATH_OUTPUT = JOB_PATH + "/output"
JOBNAME = os.getenv("JOB_NAME")
NAMESPACE = os.getenv("JOB_NAMESPACE")
PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
```
### Communication between Pods
At the beginning of `start_paddle.py`, it initializes and parses the arguments.
```python
parser = argparse.ArgumentParser(prog="start_paddle.py",
description='simple tool for k8s')
args, train_args_list = parser.parse_known_args()
train_args = refine_unknown_args(train_args_list)
train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
podlist = getPodList()
```
It then queries the status of all the Pods of this Job through the function `getPodList()`, and fetches the `trainer_id` through the function `getIdMap(podlist)` once all the Pods are in the `RUNNING` state.
```python
podlist = getPodList()
# need to wait until all pods are running
while not isPodAllRunning(podlist):
time.sleep(10)
podlist = getPodList()
idMap = getIdMap(podlist)
```
**NOTE**: `getPodList()` fetches all the Pods in the current namespace, so other
Pods that are already running there may cause errors. We will use [StatefulSets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets) instead of
Kubernetes Pods or ReplicaSets in the future.
The function `getIdMap(podlist)` fetches the IP addresses of the Pods in `podlist` and then
sorts them to generate the `trainer_id` of each node.
```python
def getIdMap(podlist):
'''
    generate trainer_id by ip
'''
ips = []
for pod in podlist["items"]:
ips.append(pod["status"]["podIP"])
ips.sort()
idMap = {}
for i in range(len(ips)):
idMap[ips[i]] = i
return idMap
```
After getting the `idMap`, we can generate the arguments of `Paddle PServer` and `Paddle Trainer`
and start them up with `startPaddle(idMap, train_args_dict)`.
### Create Job
The main goal of `startPaddle` is to generate the arguments of the `Paddle PServer` and
`Paddle Trainer` processes. Taking `Paddle Trainer` as an example, we parse the
environment variables to get `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM`, etc.,
and finally look up `trainerId` in `idMap` using the local IP address.
```python
program = 'paddle train'
args = " --nics=" + PADDLE_NIC
args += " --port=" + str(PADDLE_PORT)
args += " --ports_num=" + str(PADDLE_PORTS_NUM)
args += " --comment=" + "paddle_process_by_paddle"
ip_string = ""
for ip in idMap.keys():
ip_string += (ip + ",")
ip_string = ip_string.rstrip(",")
args += " --pservers=" + ip_string
args_ext = ""
for key, value in train_args_dict.items():
args_ext += (' --' + key + '=' + value)
localIP = socket.gethostbyname(socket.gethostname())
trainerId = idMap[localIP]
args += " " + args_ext + " --trainer_id=" + \
str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
```
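The composed command line is then used to launch the process. A minimal sketch of that last step, assuming the standard library is used to spawn the child process (the actual `start_paddle.py` may do this differently):
```python
import subprocess

# 'program' and 'args' are the strings composed above, e.g.
# "paddle train" and " --nics=... --port=... --trainer_id=0 --save_dir=...".
cmd = program + args
subprocess.check_call(cmd, shell=True)
```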
.timestamp
*.o *.o
*.a *.a
.svn .svn
......
...@@ -7,9 +7,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) ...@@ -7,9 +7,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim)
if(WITH_GPU) if(WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto) nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place memory device_context framework_proto)
else() else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto) cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place memory device_context framework_proto)
endif() endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
...@@ -21,9 +21,9 @@ endif() ...@@ -21,9 +21,9 @@ endif()
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init) nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
......
...@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include <queue>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include <queue>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -147,52 +146,7 @@ void BlockDesc::RemoveOp(size_t s, size_t e) { ...@@ -147,52 +146,7 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
return; return;
} }
auto get_vars = [](std::deque<std::unique_ptr<OpDesc>>::iterator &op, ops_.erase(ops_.begin() + s, ops_.begin() + e);
std::vector<std::string> &v) {
auto in_names = (*op)->InputArgumentNames();
v.insert(v.end(), in_names.begin(), in_names.end());
auto out_names = (*op)->OutputArgumentNames();
v.insert(v.end(), out_names.begin(), out_names.end());
std::sort(v.begin(), v.end());
auto last = std::unique(v.begin(), v.end());
v.erase(last, v.end());
};
need_update_ = true;
for (size_t i = s; i < e; i++) {
// since remove op one by one, every time remove the first op.
auto op = ops_.begin() + s;
// collect input and output variables from current delete op
std::vector<std::string> cur_vars;
get_vars(op, cur_vars);
// remove current op
ops_.erase(ops_.begin() + s);
// collect input and output variables from other ops
std::vector<std::string> other_vars;
for (auto it = ops_.begin(); it != ops_.end(); it++) {
get_vars(it, other_vars);
}
// variables should be deleted
std::vector<std::string> delete_vars;
// delete_vars = cur_vars - cur_vars ^ other_input_vars
std::set_difference(cur_vars.begin(), cur_vars.end(), other_vars.begin(),
other_vars.end(),
std::inserter(delete_vars, delete_vars.end()));
// remove variables
for (size_t i = 0; i < delete_vars.size(); i++) {
auto name = delete_vars[i];
auto it = vars_.find(name);
PADDLE_ENFORCE(it != vars_.end(),
"%s is not in variable list, it should not be deleted",
name);
vars_.erase(it);
VLOG(3) << "deleting variable " << name;
}
}
} }
std::vector<OpDesc *> BlockDesc::AllOps() const { std::vector<OpDesc *> BlockDesc::AllOps() const {
......
...@@ -105,7 +105,7 @@ static void BuildVar(const std::string& param_name, ...@@ -105,7 +105,7 @@ static void BuildVar(const std::string& param_name,
TEST(Operator, CPUtoGPU) { TEST(Operator, CPUtoGPU) {
using namespace paddle::framework; using namespace paddle::framework;
using namespace paddle::platform; using namespace paddle::platform;
InitDevices(); InitDevices(true);
paddle::framework::Scope scope; paddle::framework::Scope scope;
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
......
...@@ -5,6 +5,7 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod ...@@ -5,6 +5,7 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda) dynload_cuda)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
...@@ -15,7 +16,7 @@ else() ...@@ -15,7 +16,7 @@ else()
set(multi_devices_graph_builder_deps) set(multi_devices_graph_builder_deps)
endif() endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps}) scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context) simple_threadpool device_context)
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/details/send_op_handle.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -54,12 +55,37 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( ...@@ -54,12 +55,37 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
} }
} }
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op,
const platform::Place &p,
const size_t &i) const {
auto *op_handle = result->ops_.back().get();
op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
platform::DeviceContextPool::Instance().Get(p));
auto var_names = op->InputArgumentNames();
for (auto &each_var_name : var_names) {
VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i);
op_handle->AddInput(var);
}
var_names = op->OutputArgumentNames();
for (auto &each_var_name : var_names) {
CreateOpOutput(result, op_handle, each_var_name, p, i);
}
}
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const { const ProgramDesc &program) const {
auto graph = new SSAGraph(); auto graph = new SSAGraph();
SSAGraph &result = *graph; SSAGraph &result = *graph;
std::unordered_set<std::string> og_has_been_broadcast; std::unordered_set<std::string> og_has_been_broadcast;
result.vars_.resize(places_.size());
// We cannot invoke resize. It is a bug of GCC 4.8
result.vars_ = std::vector<
std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>(
places_.size());
bool is_forwarding = true; bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) { for (auto *op : program.Block(0).AllOps()) {
...@@ -72,27 +98,28 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -72,27 +98,28 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
} }
} }
// append send op if program is distributed trainer main program.
// always use the first device
if (!is_forwarding && op->Type() == "send") {
auto &p = places_[0];
auto *s = local_scopes_[0];
// FIXME(wuyi): send op always copy from GPU 0
result.ops_.emplace_back(new SendOpHandle(*op, s, p));
// Create inputs for output on original place and no ssa output
// is created for send op.
CreateOpHandleIOs(&result, op, p, 0);
continue;
}
for (size_t i = 0; i < places_.size(); ++i) { for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i]; auto &p = places_[i];
auto *s = local_scopes_[i]; auto *s = local_scopes_[i];
result.ops_.emplace_back(new ComputationOpHandle(*op, s, p)); result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
auto *op_handle = result.ops_.back().get(); auto *op_handle = result.ops_.back().get();
op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>( CreateOpHandleIOs(&result, op, p, i);
platform::DeviceContextPool::Instance().Get(p));
auto var_names = op->InputArgumentNames();
for (auto &each_var_name : var_names) { auto var_names = op->OutputArgumentNames();
VarHandle *var =
CreateOrGetLatestVarHandle(&result, each_var_name, p, i);
op_handle->AddInput(var);
}
var_names = op->OutputArgumentNames();
for (auto &each_var_name : var_names) {
CreateOpOutput(&result, op_handle, each_var_name, p, i);
}
if (is_forwarding) { if (is_forwarding) {
if (var_names.size() == 1 && var_names[0] == loss_var_name_) { if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
...@@ -147,15 +174,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -147,15 +174,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
if (vars.empty()) { // This device has no data. continue. if (vars.empty()) { // This device has no data. continue.
continue; continue;
} }
auto *prev_grad = &vars[vars.size() - 1]; auto &prev_grad = vars[vars.size() - 1];
op_handle->AddInput(prev_grad); op_handle->AddInput(prev_grad.get());
auto &var = vars[vars.size()]; vars.emplace_back(new VarHandle);
var.place_ = p; auto &var = vars.back();
var.name_ = og; var->place_ = p;
var.version_ = vars.size() - 1; var->name_ = og;
var->version_ = vars.size() - 1;
op_handle->AddOutput(&var); op_handle->AddOutput(var.get());
} }
#else #else
PADDLE_ENFORCE("Not implemented"); PADDLE_ENFORCE("Not implemented");
......
...@@ -14,6 +14,9 @@ ...@@ -14,6 +14,9 @@
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/ssa_graph_builder.h" #include "paddle/fluid/framework/details/ssa_graph_builder.h"
namespace paddle { namespace paddle {
...@@ -41,6 +44,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -41,6 +44,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override; std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
private:
void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p,
const size_t &i) const;
private: private:
std::string loss_var_name_; std::string loss_var_name_;
const std::vector<platform::Place> &places_; const std::vector<platform::Place> &places_;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/send_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
SendOpHandle::SendOpHandle(const framework::OpDesc &op_desc,
const Scope *local_scope,
const platform::Place &place)
: op_(framework::OpRegistry::CreateOp(op_desc)),
local_scope_(local_scope),
place_(place) {}
void SendOpHandle::RunImpl() {
// Wait input done
for (auto *in : inputs_) {
auto &p = static_cast<VarHandle *>(in)->place_;
if (in->DebugString() == "dummy") { // HACK
continue;
}
in->generated_op_->Wait(dev_ctxes_[p]);
}
op_->Run(*local_scope_, place_);
}
std::string SendOpHandle::Name() const { return "send"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace details {
struct SendOpHandle : public OpHandleBase {
std::unique_ptr<OperatorBase> op_;
const Scope* local_scope_;
const platform::Place& place_;
SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
const platform::Place& place);
std::string Name() const override;
// Delay and buffer nccl_all_reduce together can significantly increase
// performance. Disable this feature by returning false.
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -16,6 +16,8 @@ ...@@ -16,6 +16,8 @@
#include <map> #include <map>
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/var_handle.h"
...@@ -24,7 +26,9 @@ namespace framework { ...@@ -24,7 +26,9 @@ namespace framework {
namespace details { namespace details {
struct SSAGraph { struct SSAGraph {
std::vector<std::unordered_map<std::string, std::map<int, VarHandle>>> vars_; std::vector<
std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
vars_;
// aux variables to represent dependency. Useful to resolve data hazard. // aux variables to represent dependency. Useful to resolve data hazard.
std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_; std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
std::vector<std::unique_ptr<OpHandleBase>> ops_; std::vector<std::unique_ptr<OpHandleBase>> ops_;
......
...@@ -27,8 +27,8 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) { ...@@ -27,8 +27,8 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
auto it_old = name_pair.second.rbegin(); auto it_old = name_pair.second.rbegin();
++it_old; ++it_old;
for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
auto *write_op = it_new->second.generated_op_; auto *write_op = (*it_new)->generated_op_;
auto &read_ops = it_old->second.pending_ops_; auto &read_ops = (*it_old)->pending_ops_;
for (auto *read_op : read_ops) { for (auto *read_op : read_ops) {
// Manually add a dependency var from read_op to write_op; // Manually add a dependency var from read_op to write_op;
...@@ -54,14 +54,15 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( ...@@ -54,14 +54,15 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
auto &var_holder = var_holders[each_var_name]; auto &var_holder = var_holders[each_var_name];
VarHandle *var = nullptr; VarHandle *var = nullptr;
if (var_holder.empty()) { if (var_holder.empty()) {
var_holder.emplace_back(new VarHandle);
auto &init_var = var_holder[0]; auto &init_var = var_holder[0];
init_var.place_ = place; init_var->place_ = place;
init_var.name_ = each_var_name; init_var->name_ = each_var_name;
init_var.generated_op_ = nullptr; init_var->generated_op_ = nullptr;
init_var.version_ = 0; init_var->version_ = 0;
var = &init_var; var = init_var.get();
} else { } else {
var = &var_holder.rbegin()->second; var = var_holder.rbegin()->get();
} }
return var; return var;
} }
...@@ -72,11 +73,12 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, ...@@ -72,11 +73,12 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
size_t place_offset) { size_t place_offset) {
auto &vars = graph->vars_[place_offset][each_var_name]; auto &vars = graph->vars_[place_offset][each_var_name];
size_t version = vars.size(); size_t version = vars.size();
auto &var = vars[version]; vars.emplace_back(new VarHandle());
var.version_ = version; auto &var = vars.back();
var.name_ = each_var_name; var->version_ = version;
var.place_ = place; var->name_ = each_var_name;
op_handle->AddOutput(&var); var->place_ = place;
op_handle->AddOutput(var.get());
} }
template <typename Callback> template <typename Callback>
...@@ -84,7 +86,7 @@ void IterAllVar(const SSAGraph &graph, Callback callback) { ...@@ -84,7 +86,7 @@ void IterAllVar(const SSAGraph &graph, Callback callback) {
for (auto &each : graph.vars_) { for (auto &each : graph.vars_) {
for (auto &pair1 : each) { for (auto &pair1 : each) {
for (auto &pair2 : pair1.second) { for (auto &pair2 : pair1.second) {
callback(pair2.second); callback(*pair2);
} }
} }
} }
......
...@@ -69,7 +69,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -69,7 +69,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto &var_map : graph_->vars_) { for (auto &var_map : graph_->vars_) {
for (auto &name_pair : var_map) { for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) { for (auto &version_pair : name_pair.second) {
InsertPendingVar(version_pair.second); InsertPendingVar(*version_pair);
} }
} }
} }
...@@ -95,7 +95,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -95,7 +95,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto &var_map : graph_->vars_) { for (auto &var_map : graph_->vars_) {
auto it = var_map.find(fetch_var_name); auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) { if (it != var_map.end()) {
fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second); fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
} }
} }
} }
......
...@@ -93,6 +93,43 @@ static void CheckTensorNANOrInf(const std::string& name, ...@@ -93,6 +93,43 @@ static void CheckTensorNANOrInf(const std::string& name,
"Tensor %s contains NAN", name); "Tensor %s contains NAN", name);
} }
void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
int block_id) {
auto& global_block = pdesc.Block(block_id);
const Scope* ancestor_scope = scope;
while (ancestor_scope->parent()) {
ancestor_scope = ancestor_scope->parent();
}
if (ancestor_scope != scope) {
for (auto& var : global_block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var->Persistable()) {
auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto& var : global_block.AllVars()) {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
}
}
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars) {
platform::RecordBlock b(block_id); platform::RecordBlock b(block_id);
...@@ -188,8 +225,8 @@ static bool has_fetch_operators( ...@@ -188,8 +225,8 @@ static bool has_fetch_operators(
void Executor::Run(const ProgramDesc& program, Scope* scope, void Executor::Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets, std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets, std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& feed_holder_name, bool create_vars, const std::string& feed_holder_name,
const std::string& fetch_holder_name, bool create_vars) { const std::string& fetch_holder_name) {
platform::RecordBlock b(kProgramId); platform::RecordBlock b(kProgramId);
bool has_feed_ops = bool has_feed_ops =
has_feed_operators(program.Block(0), feed_targets, feed_holder_name); has_feed_operators(program.Block(0), feed_targets, feed_holder_name);
...@@ -282,38 +319,13 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare( ...@@ -282,38 +319,13 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars) {
auto& block = ctx->prog_.Block(ctx->block_id_);
Scope* local_scope = scope; Scope* local_scope = scope;
if (create_vars) { if (create_vars) {
if (create_local_scope) { if (create_local_scope) {
local_scope = &scope->NewScope(); local_scope = &scope->NewScope();
for (auto& var : block.AllVars()) { }
if (var->Name() == framework::kEmptyVarName) { CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
continue; }
}
if (var->Persistable()) {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = local_scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto& var : block.AllVars()) {
auto* ptr = local_scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
} // if (create_local_scope)
} // if (create_vars)
for (auto& op : ctx->ops_) { for (auto& op : ctx->ops_) {
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
......
...@@ -54,9 +54,9 @@ class Executor { ...@@ -54,9 +54,9 @@ class Executor {
void Run(const ProgramDesc& program, Scope* scope, void Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets, std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets, std::map<std::string, LoDTensor*>& fetch_targets,
bool create_vars = true,
const std::string& feed_holder_name = "feed", const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch", const std::string& fetch_holder_name = "fetch");
bool create_vars = true);
static std::unique_ptr<ExecutorPrepareContext> Prepare( static std::unique_ptr<ExecutorPrepareContext> Prepare(
const ProgramDesc& program, int block_id); const ProgramDesc& program, int block_id);
...@@ -64,6 +64,8 @@ class Executor { ...@@ -64,6 +64,8 @@ class Executor {
static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare( static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids); const ProgramDesc& program, const std::vector<int>& block_ids);
void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true, bool create_local_scope = true,
bool create_vars = true); bool create_vars = true);
......
...@@ -64,7 +64,7 @@ void InitP2P(int count) { ...@@ -64,7 +64,7 @@ void InitP2P(int count) {
#endif #endif
} }
void InitDevices() { void InitDevices(bool init_p2p) {
/*Init all avaiable devices by default */ /*Init all avaiable devices by default */
std::vector<platform::Place> places; std::vector<platform::Place> places;
...@@ -85,7 +85,9 @@ void InitDevices() { ...@@ -85,7 +85,9 @@ void InitDevices() {
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
places.emplace_back(platform::CUDAPlace(i)); places.emplace_back(platform::CUDAPlace(i));
} }
InitP2P(count); if (init_p2p) {
InitP2P(count);
}
platform::DeviceContextPool::Init(places); platform::DeviceContextPool::Init(places);
} }
......
...@@ -24,7 +24,7 @@ void InitGflags(std::vector<std::string> &argv); ...@@ -24,7 +24,7 @@ void InitGflags(std::vector<std::string> &argv);
void InitGLOG(const std::string &prog_name); void InitGLOG(const std::string &prog_name);
void InitDevices(); void InitDevices(bool init_p2p);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -21,7 +21,7 @@ TEST(InitDevices, CPU) { ...@@ -21,7 +21,7 @@ TEST(InitDevices, CPU) {
using paddle::platform::DeviceContextPool; using paddle::platform::DeviceContextPool;
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
InitDevices(); InitDevices(true);
DeviceContextPool& pool = DeviceContextPool::Instance(); DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U); ASSERT_EQ(pool.size(), 1U);
#endif #endif
...@@ -33,7 +33,7 @@ TEST(InitDevices, CUDA) { ...@@ -33,7 +33,7 @@ TEST(InitDevices, CUDA) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
int count = paddle::platform::GetCUDADeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
InitDevices(); InitDevices(true);
DeviceContextPool& pool = DeviceContextPool::Instance(); DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count)); ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
#endif #endif
......
...@@ -12,9 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include <stdint.h>
#include <string.h>
#include <algorithm>
#include <iterator>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
...@@ -22,11 +27,6 @@ limitations under the License. */ ...@@ -22,11 +27,6 @@ limitations under the License. */
#include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h" #include "paddle/fluid/recordio/writer.h"
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <iterator>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -294,7 +294,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, ...@@ -294,7 +294,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx); TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
} }
void WriteToRecordIO(recordio::Writer &writer, void WriteToRecordIO(recordio::Writer *writer,
const std::vector<LoDTensor> &tensor, const std::vector<LoDTensor> &tensor,
const platform::DeviceContext &dev_ctx) { const platform::DeviceContext &dev_ctx) {
std::stringstream buffer; std::stringstream buffer;
...@@ -303,18 +303,20 @@ void WriteToRecordIO(recordio::Writer &writer, ...@@ -303,18 +303,20 @@ void WriteToRecordIO(recordio::Writer &writer,
for (auto &each : tensor) { for (auto &each : tensor) {
SerializeToStream(buffer, each, dev_ctx); SerializeToStream(buffer, each, dev_ctx);
} }
writer.Write(buffer.str()); writer->Write(buffer.str());
} }
std::vector<LoDTensor> ReadFromRecordIO( std::vector<LoDTensor> ReadFromRecordIO(
recordio::Scanner &scanner, const platform::DeviceContext &dev_ctx) { recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) {
std::istringstream sin(scanner.Next());
uint32_t sz;
sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
std::vector<LoDTensor> result; std::vector<LoDTensor> result;
result.resize(sz); if (scanner->HasNext()) {
for (uint32_t i = 0; i < sz; ++i) { std::istringstream sin(scanner->Next());
DeserializeFromStream(sin, &result[i], dev_ctx); uint32_t sz;
sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
result.resize(sz);
for (uint32_t i = 0; i < sz; ++i) {
DeserializeFromStream(sin, &result[i], dev_ctx);
}
} }
return result; return result;
} }
......
...@@ -15,6 +15,9 @@ limitations under the License. */ ...@@ -15,6 +15,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory> #include <memory>
#include <string>
#include <utility>
#include <vector>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
...@@ -216,12 +219,12 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor, ...@@ -216,12 +219,12 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
void DeserializeFromStream(std::istream& is, LoDTensor* tensor, void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
const platform::DeviceContext& dev_ctx); const platform::DeviceContext& dev_ctx);
extern void WriteToRecordIO(recordio::Writer& writer, extern void WriteToRecordIO(recordio::Writer* writer,
const std::vector<LoDTensor>& tensor, const std::vector<LoDTensor>& tensor,
const platform::DeviceContext& dev_ctx); const platform::DeviceContext& dev_ctx);
extern std::vector<LoDTensor> ReadFromRecordIO( extern std::vector<LoDTensor> ReadFromRecordIO(
recordio::Scanner& scanner, const platform::DeviceContext& dev_ctx); recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -12,17 +12,17 @@ ...@@ -12,17 +12,17 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h"
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -240,8 +240,8 @@ TEST(LoDTensor, RecordIO) { ...@@ -240,8 +240,8 @@ TEST(LoDTensor, RecordIO) {
*platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
{ {
recordio::Writer writer(stream, recordio::Compressor::kSnappy); recordio::Writer writer(stream, recordio::Compressor::kSnappy);
WriteToRecordIO(writer, {tensor, tensor}, ctx); WriteToRecordIO(&writer, {tensor, tensor}, ctx);
WriteToRecordIO(writer, {tensor, tensor}, ctx); WriteToRecordIO(&writer, {tensor, tensor}, ctx);
writer.Flush(); writer.Flush();
} }
...@@ -254,11 +254,11 @@ TEST(LoDTensor, RecordIO) { ...@@ -254,11 +254,11 @@ TEST(LoDTensor, RecordIO) {
{ {
std::unique_ptr<std::istream> stream_ptr(stream); std::unique_ptr<std::istream> stream_ptr(stream);
recordio::Scanner scanner(std::move(stream_ptr)); recordio::Scanner scanner(std::move(stream_ptr));
auto tensors = ReadFromRecordIO(scanner, ctx); auto tensors = ReadFromRecordIO(&scanner, ctx);
ASSERT_EQ(tensors.size(), 2); ASSERT_EQ(tensors.size(), 2);
assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[0]);
assert_tensor_ok(tensors[1]); assert_tensor_ok(tensors[1]);
tensors = ReadFromRecordIO(scanner, ctx); tensors = ReadFromRecordIO(&scanner, ctx);
ASSERT_EQ(tensors.size(), 2); ASSERT_EQ(tensors.size(), 2);
assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[0]);
assert_tensor_ok(tensors[1]); assert_tensor_ok(tensors[1]);
......
...@@ -30,7 +30,7 @@ __global__ void test(size_t* a, int size) { ...@@ -30,7 +30,7 @@ __global__ void test(size_t* a, int size) {
} }
TEST(LoD, data) { TEST(LoD, data) {
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
paddle::framework::LoD lod{{0, 1, 2}}; paddle::framework::LoD lod{{0, 1, 2}};
lod.push_back({0, 2, 4, 5}); lod.push_back({0, 2, 4, 5});
...@@ -46,7 +46,7 @@ TEST(LoD, data) { ...@@ -46,7 +46,7 @@ TEST(LoD, data) {
} }
TEST(LoDTensor, LoDInGPU) { TEST(LoDTensor, LoDInGPU) {
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
paddle::framework::LoDTensor lod_tensor; paddle::framework::LoDTensor lod_tensor;
paddle::platform::CUDAPlace place(0); paddle::platform::CUDAPlace place(0);
......
...@@ -72,7 +72,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator, ...@@ -72,7 +72,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator,
paddle::framework::OpWithoutKernelCheckerMaker); paddle::framework::OpWithoutKernelCheckerMaker);
TEST(OperatorBase, all) { TEST(OperatorBase, all) {
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
paddle::framework::proto::OpDesc op_desc; paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("test_operator"); op_desc.set_type("test_operator");
BuildVar("input", {"IN1"}, op_desc.add_inputs()); BuildVar("input", {"IN1"}, op_desc.add_inputs());
...@@ -198,7 +198,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel, ...@@ -198,7 +198,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,
// test with single input // test with single input
TEST(OpKernel, all) { TEST(OpKernel, all) {
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
paddle::framework::proto::OpDesc op_desc; paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("op_with_kernel"); op_desc.set_type("op_with_kernel");
BuildVar("x", {"IN1"}, op_desc.add_inputs()); BuildVar("x", {"IN1"}, op_desc.add_inputs());
...@@ -228,7 +228,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, ...@@ -228,7 +228,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
TEST(OpKernel, multi_inputs) { TEST(OpKernel, multi_inputs) {
using namespace paddle::framework; using namespace paddle::framework;
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
proto::OpDesc op_desc; proto::OpDesc op_desc;
op_desc.set_type("op_multi_inputs_with_kernel"); op_desc.set_type("op_multi_inputs_with_kernel");
...@@ -269,7 +269,7 @@ class OperatorClone : public paddle::framework::OperatorBase { ...@@ -269,7 +269,7 @@ class OperatorClone : public paddle::framework::OperatorBase {
}; };
TEST(Operator, Clone) { TEST(Operator, Clone) {
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
OperatorClone a("ABC", paddle::framework::VariableNameMap{}, OperatorClone a("ABC", paddle::framework::VariableNameMap{},
paddle::framework::VariableNameMap{}, paddle::framework::VariableNameMap{},
paddle::framework::AttributeMap{}); paddle::framework::AttributeMap{});
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/platform/profiler.h"
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -24,6 +23,7 @@ limitations under the License. */ ...@@ -24,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -43,30 +43,40 @@ class ParallelExecutorPrivate { ...@@ -43,30 +43,40 @@ class ParallelExecutorPrivate {
#endif #endif
}; };
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
return member_->local_scopes_;
}
ParallelExecutor::ParallelExecutor( ParallelExecutor::ParallelExecutor(
size_t num_threads, bool use_event, size_t num_threads, bool use_event,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program, const ProgramDesc &main_program, const std::unordered_set<std::string> &bcast_vars,
const std::string &loss_var_name, Scope *scope, bool allow_op_delay) const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay)
: member_(new ParallelExecutorPrivate(places)) { : member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope; member_->global_scope_ = scope;
// Step 1. RunStartupProgram and Bcast the params to devs. // Step 1. Bcast the params to devs.
Executor exe(places[0]);
exe.Run(startup_program, scope, 0);
// Create local scopes // Create local scopes
for (size_t i = 0; i < member_->places_.size(); ++i) { if (local_scopes.empty()) {
member_->local_scopes_.push_back(&scope->NewScope()); for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(&scope->NewScope());
}
} else {
PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(local_scopes[i]);
}
} }
// Bcast Parameters to all GPUs // Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_)); member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
#endif #endif
if (platform::is_gpu_place(places[0]) && if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
member_->local_scopes_.size() != 1) { // Is CUDA local_scopes.empty()) { // Is CUDA
BCastParamsToGPUs(startup_program); BCastParamsToGPUs(bcast_vars);
} }
// Startup Program has been run. All local scopes has correct parameters. // Startup Program has been run. All local scopes has correct parameters.
...@@ -99,48 +109,45 @@ ParallelExecutor::ParallelExecutor( ...@@ -99,48 +109,45 @@ ParallelExecutor::ParallelExecutor(
} }
void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::BCastParamsToGPUs(
const ProgramDesc &startup_program) const { const std::unordered_set<std::string> &vars) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto *main_scope = member_->local_scopes_[0]; auto *main_scope = member_->local_scopes_[0];
for (auto *var_desc : startup_program.Block(0).AllVars()) { for (auto &var : vars) {
size_t idx = var_desc->Name().find("@GRAD"); auto *main_var = main_scope->FindVar(var);
if (idx != std::string::npos) continue; if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
if (var_desc->GetType() == proto::VarType::LOD_TENSOR) { continue;
auto &main_tensor = }
main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
auto &main_tensor = main_var->Get<LoDTensor>();
auto &dims = main_tensor.dims(); auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) {
if (paddle::platform::is_gpu_place(main_tensor.place())) { size_t numel = main_tensor.numel();
size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); platform::NCCLGroupGuard guard;
platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) {
for (size_t i = 0; i < member_->places_.size(); ++i) { auto place = member_->places_[i];
auto place = member_->places_[i]; void *buffer;
void *buffer; if (i == 0) {
if (i == 0) { buffer = const_cast<void *>(main_tensor.data<void>());
buffer = const_cast<void *>(main_tensor.data<void>()); } else {
} else {
auto local_scope = member_->local_scopes_[i];
auto *t =
local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
t->Resize(dims);
buffer = t->mutable_data(place, main_tensor.type());
}
auto &nccl_ctx = member_->nccl_ctxs_->at(place);
platform::dynload::ncclBcast(buffer, numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
}
} else {
platform::CPUPlace cpu;
for (size_t i = 1; i < member_->places_.size(); ++i) {
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>(); auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
t->Resize(dims); t->Resize(dims);
t->mutable_data(cpu, main_tensor.type()); buffer = t->mutable_data(place, main_tensor.type());
paddle::framework::TensorCopy(main_tensor, cpu, t);
} }
auto &nccl_ctx = member_->nccl_ctxs_->at(place);
platform::dynload::ncclBcast(buffer, numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
}
} else {
platform::CPUPlace cpu;
for (size_t i = 1; i < member_->places_.size(); ++i) {
auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
t->Resize(dims);
t->mutable_data(cpu, main_tensor.type());
paddle::framework::TensorCopy(main_tensor, cpu, t);
} }
} }
member_->nccl_ctxs_->WaitAll(); member_->nccl_ctxs_->WaitAll();
...@@ -165,12 +172,17 @@ void ParallelExecutor::SplitTensorToPlaces( ...@@ -165,12 +172,17 @@ void ParallelExecutor::SplitTensorToPlaces(
const std::unordered_map<std::string, LoDTensor> &feed_tensors) { const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
for (auto it : feed_tensors) { for (auto it : feed_tensors) {
auto lod_tensors = it.second.SplitLoDTensor(member_->places_); auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
PADDLE_ENFORCE_EQ(
member_->places_.size(), lod_tensors.size(),
"The number of samples of current batch is less than the count of "
"devices, currently, it is not allowed. (%d vs %d)",
member_->places_.size(), lod_tensors.size());
for (size_t j = 0; j < member_->places_.size(); ++j) { for (size_t j = 0; j < member_->places_.size(); ++j) {
// TODO(panxy0718): Do I need to delete this var? // TODO(panxy0718): Do I need to delete this var?
member_->local_scopes_[j] auto t =
->Var(it.first) member_->local_scopes_[j]->Var(it.first)->GetMutable<LoDTensor>();
->GetMutable<LoDTensor>() t->ShareDataWith(lod_tensors[j]);
->ShareDataWith(lod_tensors[j]); t->set_lod(lod_tensors[j].lod());
} }
} }
} }
......
...@@ -36,22 +36,25 @@ class ParallelExecutor { ...@@ -36,22 +36,25 @@ class ParallelExecutor {
explicit ParallelExecutor(size_t num_threads, bool use_event, explicit ParallelExecutor(size_t num_threads, bool use_event,
const std::vector<platform::Place>& places, const std::vector<platform::Place>& places,
const std::unordered_set<std::string>& params, const std::unordered_set<std::string>& params,
const ProgramDesc& startup_program, const std::unordered_set<std::string>& bcast_vars,
const ProgramDesc& main_program, const ProgramDesc& main_program,
const std::string& loss_var_name, Scope* scope, const std::string& loss_var_name, Scope* scope,
const std::vector<Scope*>& local_scopes,
bool allow_op_delay); bool allow_op_delay);
std::vector<Scope*>& GetLocalScopes();
void Run(const std::vector<std::string>& fetch_tensors, void Run(const std::vector<std::string>& fetch_tensors,
const std::string& fetched_var_name, const std::string& fetched_var_name,
const std::unordered_map<std::string, LoDTensor>& feed_tensors); const std::unordered_map<std::string, LoDTensor>& feed_tensors);
void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
private: private:
void SplitTensorToPlaces( void SplitTensorToPlaces(
const std::unordered_map<std::string, LoDTensor>& feed_tensors); const std::unordered_map<std::string, LoDTensor>& feed_tensors);
ParallelExecutorPrivate* member_; ParallelExecutorPrivate* member_;
void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
}; };
} // namespace framework } // namespace framework
......
...@@ -85,9 +85,9 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) { ...@@ -85,9 +85,9 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
} }
const std::vector<std::string> ProgramDesc::GetFeedTargetNames() { const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
BlockDesc *global_block = blocks_[0].get(); auto &global_block = Block(0);
std::vector<std::string> feed_target_names; std::vector<std::string> feed_target_names;
for (auto *op : global_block->AllOps()) { for (auto *op : global_block.AllOps()) {
if (op->Type() == kFeedOpType) { if (op->Type() == kFeedOpType) {
feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]); feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]);
} }
...@@ -96,9 +96,9 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() { ...@@ -96,9 +96,9 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
} }
const std::vector<std::string> ProgramDesc::GetFetchTargetNames() { const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
BlockDesc *global_block = blocks_[0].get(); auto &global_block = Block(0);
std::vector<std::string> fetch_target_names; std::vector<std::string> fetch_target_names;
for (auto *op : global_block->AllOps()) { for (auto *op : global_block.AllOps()) {
if (op->Type() == kFetchOpType) { if (op->Type() == kFetchOpType) {
fetch_target_names.push_back(op->Input("X")[0]); fetch_target_names.push_back(op->Input("X")[0]);
} }
...@@ -106,5 +106,43 @@ const std::vector<std::string> ProgramDesc::GetFetchTargetNames() { ...@@ -106,5 +106,43 @@ const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
return fetch_target_names; return fetch_target_names;
} }
void ProgramDesc::SetFeedHolderName(const std::string &feed_holder_name) {
auto *global_block = MutableBlock(0);
int index = 0;
for (auto *op : global_block->AllOps()) {
if (op->Type() == kFeedOpType) {
// Unify the input's name of all feed_ops to feed_holder_name
global_block->RemoveVar(op->Input("X")[0]);
op->SetInput("X", {feed_holder_name});
op->SetAttr("col", {index});
op->CheckAttrs();
index++;
}
}
auto *feed_holder = global_block->Var(feed_holder_name);
feed_holder->SetType(proto::VarType::FEED_MINIBATCH);
feed_holder->SetPersistable(true);
}
void ProgramDesc::SetFetchHolderName(const std::string &fetch_holder_name) {
auto *global_block = MutableBlock(0);
int index = 0;
for (auto *op : global_block->AllOps()) {
if (op->Type() == kFetchOpType) {
// Unify the output's name of all fetch_ops to fetch_holder_name
global_block->RemoveVar(op->Output("Out")[0]);
op->SetOutput("Out", {fetch_holder_name});
op->SetAttr("col", {index});
op->CheckAttrs();
index++;
}
}
auto *fetch_holder = global_block->Var(fetch_holder_name);
fetch_holder->SetType(proto::VarType::FETCH_LIST);
fetch_holder->SetPersistable(true);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <memory> #include <memory>
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
...@@ -52,9 +53,26 @@ class ProgramDesc { ...@@ -52,9 +53,26 @@ class ProgramDesc {
proto::ProgramDesc *Proto(); proto::ProgramDesc *Proto();
// The output variable of feed_op is referenced as feed_target.
// This function is used to collect the output variable's name of all
// feed_ops.
const std::vector<std::string> GetFeedTargetNames(); const std::vector<std::string> GetFeedTargetNames();
// The input variable of fetch_op is referenced as fetch_target.
// This function is used to collect the input variable's name of all
// fetch_ops.
const std::vector<std::string> GetFetchTargetNames(); const std::vector<std::string> GetFetchTargetNames();
// The input variable of feed_op that holds input Tensor provided by users is
// referenced as feed_holder.
// This function is used to change or unify the feed_holder variables' name.
void SetFeedHolderName(const std::string &feed_holder_name);
// The output variable of fetch_op that holds output Tensor needed by users is
// referenced as fetch_holder.
// This function is used to change or unify the fetch_holder variables' name.
void SetFetchHolderName(const std::string &fetch_holder_name);
private: private:
proto::ProgramDesc desc_; proto::ProgramDesc desc_;
......
...@@ -22,7 +22,9 @@ FileReader::FileReader(const std::vector<DDim> &dims) : dims_(dims) {} ...@@ -22,7 +22,9 @@ FileReader::FileReader(const std::vector<DDim> &dims) : dims_(dims) {}
void FileReader::ReadNext(std::vector<LoDTensor> *out) { void FileReader::ReadNext(std::vector<LoDTensor> *out) {
ReadNextImpl(out); ReadNextImpl(out);
PADDLE_ENFORCE_EQ(out->size(), dims_.size()); if (out->empty()) {
return;
}
for (size_t i = 0; i < dims_.size(); ++i) { for (size_t i = 0; i < dims_.size(); ++i) {
auto &actual = out->at(i).dims(); auto &actual = out->at(i).dims();
auto &expect = dims_[i]; auto &expect = dims_[i];
......
...@@ -14,14 +14,13 @@ ...@@ -14,14 +14,13 @@
#pragma once #pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include <memory>
#include <thread>
#include <vector>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -31,8 +30,6 @@ class ReaderBase { ...@@ -31,8 +30,6 @@ class ReaderBase {
virtual void ReInit() = 0; virtual void ReInit() = 0;
virtual bool HasNext() const = 0;
virtual ~ReaderBase(); virtual ~ReaderBase();
}; };
...@@ -44,8 +41,6 @@ class DecoratedReader : public ReaderBase { ...@@ -44,8 +41,6 @@ class DecoratedReader : public ReaderBase {
void ReInit() override { reader_->ReInit(); } void ReInit() override { reader_->ReInit(); }
bool HasNext() const override { return reader_->HasNext(); }
protected: protected:
ReaderBase* reader_; ReaderBase* reader_;
}; };
...@@ -80,8 +75,6 @@ class ReaderHolder { ...@@ -80,8 +75,6 @@ class ReaderHolder {
reader_->ReInit(); reader_->ReInit();
} }
bool HasNext() const { return reader_->HasNext(); }
private: private:
std::unique_ptr<ReaderBase> reader_; std::unique_ptr<ReaderBase> reader_;
}; };
......
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include <memory> // for unique_ptr #include <memory> // for unique_ptr
#include <mutex> // for call_once
#include <set> #include <set>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
...@@ -39,6 +38,7 @@ Scope::~Scope() { ...@@ -39,6 +38,7 @@ Scope::~Scope() {
} }
Scope& Scope::NewScope() const { Scope& Scope::NewScope() const {
std::unique_lock<std::mutex> lock(mutex_);
kids_.push_back(new Scope(this)); kids_.push_back(new Scope(this));
return *kids_.back(); return *kids_.back();
} }
...@@ -92,6 +92,7 @@ std::vector<std::string> Scope::LocalVarNames() const { ...@@ -92,6 +92,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
} }
void Scope::DeleteScope(Scope* scope) { void Scope::DeleteScope(Scope* scope) {
std::unique_lock<std::mutex> lock(mutex_);
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it); this->kids_.erase(it);
...@@ -103,7 +104,7 @@ void Scope::DeleteScope(Scope* scope) { ...@@ -103,7 +104,7 @@ void Scope::DeleteScope(Scope* scope) {
} }
} }
void Scope::EraseVars(std::vector<std::string>& var_names) { void Scope::EraseVars(const std::vector<std::string>& var_names) {
std::set<std::string> var_set(var_names.begin(), var_names.end()); std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) { for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) { if (var_set.find(it->first) != var_set.end()) {
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <list> #include <list>
#include <mutex> // NOLINT
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
...@@ -51,13 +52,13 @@ class Scope { ...@@ -51,13 +52,13 @@ class Scope {
/// Create a variable with a scope-unique name. /// Create a variable with a scope-unique name.
Variable* Var(std::string* name = nullptr); Variable* Var(std::string* name = nullptr);
void EraseVars(std::vector<std::string>& var_names); void EraseVars(const std::vector<std::string>& var_names);
/// Find a variable in the scope or any of its ancestors. Returns /// Find a variable in the scope or any of its ancestors. Returns
/// nullptr if cannot find. /// nullptr if cannot find.
Variable* FindVar(const std::string& name) const; Variable* FindVar(const std::string& name) const;
const Scope& parent() const { return *parent_; } const Scope* parent() const { return parent_; }
/// Find the scope or an ancestor scope that contains the given variable. /// Find the scope or an ancestor scope that contains the given variable.
const Scope* FindScope(const Variable* var) const; const Scope* FindScope(const Variable* var) const;
...@@ -88,6 +89,9 @@ class Scope { ...@@ -88,6 +89,9 @@ class Scope {
Scope const* parent_{nullptr}; Scope const* parent_{nullptr};
DISABLE_COPY_AND_ASSIGN(Scope); DISABLE_COPY_AND_ASSIGN(Scope);
private:
mutable std::mutex mutex_;
}; };
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
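The mutex added to Scope above protects kids_ when NewScope and DeleteScope are called concurrently, which the multi-threaded inference test introduced later in this change can now do. A minimal standalone sketch of the same locking pattern (DemoScope and its members are illustrative names, not the real Scope class; destructor omitted for brevity):

#include <algorithm>
#include <list>
#include <mutex>

class DemoScope {
 public:
  DemoScope& NewKid() {
    std::unique_lock<std::mutex> lock(mutex_);  // guard kids_ against concurrent callers
    kids_.push_back(new DemoScope());
    return *kids_.back();
  }
  void DeleteKid(DemoScope* kid) {
    std::unique_lock<std::mutex> lock(mutex_);
    kids_.erase(std::find(kids_.begin(), kids_.end(), kid));
    delete kid;
  }

 private:
  std::list<DemoScope*> kids_;
  std::mutex mutex_;  // mutable in the real Scope, since NewScope() is a const member
};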
set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init) set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init)
cc_library(paddle_fluid_api cc_library(paddle_fluid_api
SRCS io.cc SRCS io.cc
......
...@@ -24,7 +24,8 @@ function(inference_test TARGET_NAME) ...@@ -24,7 +24,8 @@ function(inference_test TARGET_NAME)
endforeach() endforeach()
endfunction(inference_test) endfunction(inference_test)
inference_test(fit_a_line) # This unittest is buggy!
#inference_test(fit_a_line)
inference_test(image_classification ARGS vgg resnet) inference_test(image_classification ARGS vgg resnet)
inference_test(label_semantic_roles) inference_test(label_semantic_roles)
inference_test(recognize_digits ARGS mlp conv) inference_test(recognize_digits ARGS mlp conv)
......
...@@ -12,6 +12,7 @@ limitations under the License. */ ...@@ -12,6 +12,7 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/inference/tests/test_multi_thread_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -26,32 +27,63 @@ TEST(inference, fit_a_line) { ...@@ -26,32 +27,63 @@ TEST(inference, fit_a_line) {
// 0. Call `paddle::framework::InitDevices()` initialize all the devices // 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor input; for (int num_threads : {1, 2}) {
// The second dim of the input tensor should be 13 std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_feeds;
// The input data should be >= 0 cpu_feeds.resize(num_threads);
int64_t batch_size = 10; for (int i = 0; i < num_threads; ++i) {
SetupTensor<float>(&input, {batch_size, 13}, static_cast<float>(0), auto* input = new paddle::framework::LoDTensor();
static_cast<float>(10)); // The second dim of the input tensor should be 13
std::vector<paddle::framework::LoDTensor*> cpu_feeds; // The input data should be >= 0
cpu_feeds.push_back(&input); int64_t batch_size = 10;
SetupTensor<float>(input, {batch_size, 13}, static_cast<float>(0),
static_cast<float>(10));
cpu_feeds[i].push_back(input);
}
paddle::framework::LoDTensor output1; std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1; cpu_fetchs1.resize(num_threads);
cpu_fetchs1.push_back(&output1); for (int i = 0; i < num_threads; ++i) {
auto* output = new paddle::framework::LoDTensor();
cpu_fetchs1[i].push_back(output);
}
// Run inference on CPU // Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1); LOG(INFO) << "--- CPU Runs (num_threads: " << num_threads << "): ---";
LOG(INFO) << output1.dims(); if (num_threads == 1) {
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds[0],
cpu_fetchs1[0]);
} else {
TestMultiThreadInference<paddle::platform::CPUPlace>(
dirname, cpu_feeds, cpu_fetchs1, num_threads);
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2; std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2; cpu_fetchs2.resize(num_threads);
cpu_fetchs2.push_back(&output2); for (int i = 0; i < num_threads; ++i) {
auto* output = new paddle::framework::LoDTensor();
cpu_fetchs2[i].push_back(output);
}
// Run inference on CUDA GPU // Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2); LOG(INFO) << "--- GPU Runs (num_threads: " << num_threads << "): ---";
LOG(INFO) << output2.dims(); if (num_threads == 1) {
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds[0],
cpu_fetchs2[0]);
} else {
TestMultiThreadInference<paddle::platform::CUDAPlace>(
dirname, cpu_feeds, cpu_fetchs2, num_threads);
}
CheckError<float>(output1, output2); for (int i = 0; i < num_threads; ++i) {
CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
delete cpu_fetchs2[i][0];
}
#endif #endif
for (int i = 0; i < num_threads; ++i) {
delete cpu_feeds[i][0];
delete cpu_fetchs1[i][0];
}
} // num_threads-loop
} }
...@@ -46,8 +46,8 @@ TEST(inference, image_classification) { ...@@ -46,8 +46,8 @@ TEST(inference, image_classification) {
// Run inference on CPU // Run inference on CPU
LOG(INFO) << "--- CPU Runs: ---"; LOG(INFO) << "--- CPU Runs: ---";
TestInference<paddle::platform::CPUPlace, true>(dirname, cpu_feeds, TestInference<paddle::platform::CPUPlace, false, true>(
cpu_fetchs1, FLAGS_repeat); dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -57,8 +57,8 @@ TEST(inference, image_classification) { ...@@ -57,8 +57,8 @@ TEST(inference, image_classification) {
// Run inference on CUDA GPU // Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: ---"; LOG(INFO) << "--- GPU Runs: ---";
TestInference<paddle::platform::CUDAPlace, true>(dirname, cpu_feeds, TestInference<paddle::platform::CUDAPlace, false, true>(
cpu_fetchs2, FLAGS_repeat); dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
CheckError<float>(output1, output2); CheckError<float>(output1, output2);
......
...@@ -25,7 +25,8 @@ limitations under the License. */ ...@@ -25,7 +25,8 @@ limitations under the License. */
template <typename T> template <typename T>
void SetupTensor(paddle::framework::LoDTensor* input, void SetupTensor(paddle::framework::LoDTensor* input,
paddle::framework::DDim dims, T lower, T upper) { paddle::framework::DDim dims, T lower, T upper) {
std::mt19937 rng(100); // An arbitrarily chosen but fixed seed. static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1); std::uniform_real_distribution<double> uniform_dist(0, 1);
T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace()); T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
...@@ -88,7 +89,7 @@ void CheckError(const paddle::framework::LoDTensor& output1, ...@@ -88,7 +89,7 @@ void CheckError(const paddle::framework::LoDTensor& output1,
EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
} }
template <typename Place, bool PrepareContext = false> template <typename Place, bool CreateVars = true, bool PrepareContext = false>
void TestInference(const std::string& dirname, void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds, const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs, const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
...@@ -166,6 +167,13 @@ void TestInference(const std::string& dirname, ...@@ -166,6 +167,13 @@ void TestInference(const std::string& dirname,
// 6. Run the inference program // 6. Run the inference program
{ {
if (!CreateVars) {
// If users don't want to create and destroy variables every time they
// run, they need to set `create_vars` to false and manually call
// `CreateVariables` before running.
executor.CreateVariables(*inference_program, scope, 0);
}
// Ignore the profiling results of the first run // Ignore the profiling results of the first run
std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx; std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
if (PrepareContext) { if (PrepareContext) {
...@@ -173,7 +181,8 @@ void TestInference(const std::string& dirname, ...@@ -173,7 +181,8 @@ void TestInference(const std::string& dirname,
executor.RunPreparedContext(ctx.get(), scope, feed_targets, executor.RunPreparedContext(ctx.get(), scope, feed_targets,
fetch_targets); fetch_targets);
} else { } else {
executor.Run(*inference_program, scope, feed_targets, fetch_targets); executor.Run(*inference_program, scope, feed_targets, fetch_targets,
CreateVars);
} }
// Enable the profiler // Enable the profiler
...@@ -191,7 +200,8 @@ void TestInference(const std::string& dirname, ...@@ -191,7 +200,8 @@ void TestInference(const std::string& dirname,
executor.RunPreparedContext(ctx.get(), scope, feed_targets, executor.RunPreparedContext(ctx.get(), scope, feed_targets,
fetch_targets); fetch_targets);
} else { } else {
executor.Run(*inference_program, scope, feed_targets, fetch_targets); executor.Run(*inference_program, scope, feed_targets, fetch_targets,
CreateVars);
} }
} }
......
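With this change the helper's template signature reads TestInference<Place, CreateVars, PrepareContext>. A short sketch of the two call styles it supports, assuming dirname, cpu_feeds and cpu_fetchs are prepared as in the tests above:

// Default: create and destroy local variables on every Run (CreateVars = true),
// no prepared execution context.
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs);

// Reuse variables across repeated runs (CreateVars = false) and prepare the
// context once (PrepareContext = true), as the image_classification test now does;
// the helper then calls executor.CreateVariables() once before the runs.
TestInference<paddle::platform::CPUPlace, false, true>(
    dirname, cpu_feeds, cpu_fetchs, FLAGS_repeat);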
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
void ThreadedRunInference(
const std::unique_ptr<paddle::framework::ProgramDesc>& inference_program,
paddle::framework::Executor* executor, paddle::framework::Scope* scope,
const int thread_id,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
new paddle::framework::ProgramDesc(*inference_program));
std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id);
std::string fetch_holder_name =
"fetch_" + paddle::string::to_string(thread_id);
copy_program->SetFeedHolderName(feed_holder_name);
copy_program->SetFetchHolderName(fetch_holder_name);
// 3. Get the feed_target_names and fetch_target_names
const std::vector<std::string>& feed_target_names =
copy_program->GetFeedTargetNames();
const std::vector<std::string>& fetch_target_names =
copy_program->GetFetchTargetNames();
// 4. Prepare inputs: set up maps for feed targets
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
for (size_t i = 0; i < feed_target_names.size(); ++i) {
// Please make sure that cpu_feeds[i] is right for feed_target_names[i]
feed_targets[feed_target_names[i]] = cpu_feeds[i];
}
// 5. Define Tensor to get the outputs: set up maps for fetch targets
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
for (size_t i = 0; i < fetch_target_names.size(); ++i) {
fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
}
// 6. Run the inference program
executor->Run(*copy_program, scope, feed_targets, fetch_targets, true,
feed_holder_name, fetch_holder_name);
}
template <typename Place>
void TestMultiThreadInference(
const std::string& dirname,
const std::vector<std::vector<paddle::framework::LoDTensor*>>& cpu_feeds,
const std::vector<std::vector<paddle::framework::LoDTensor*>>& cpu_fetchs,
const int num_threads) {
// 1. Define place, executor, scope
auto place = Place();
auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope();
// 2. Initialize the inference_program and load parameters
std::unique_ptr<paddle::framework::ProgramDesc> inference_program =
paddle::inference::Load(executor, *scope, dirname);
std::vector<std::thread*> threads;
for (int i = 0; i < num_threads; ++i) {
threads.push_back(new std::thread(
ThreadedRunInference, std::ref(inference_program), &executor, scope, i,
std::ref(cpu_feeds[i]), std::ref(cpu_fetchs[i])));
}
for (int i = 0; i < num_threads; ++i) {
threads[i]->join();
delete threads[i];
}
delete scope;
}
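A usage sketch of the new helper, mirroring the fit_a_line test above; the per-thread LoDTensors are owned by the caller and must be deleted afterwards:

int num_threads = 2;
std::vector<std::vector<paddle::framework::LoDTensor*>> feeds(num_threads);
std::vector<std::vector<paddle::framework::LoDTensor*>> fetchs(num_threads);
for (int i = 0; i < num_threads; ++i) {
  feeds[i].push_back(new paddle::framework::LoDTensor());   // filled with input data
  fetchs[i].push_back(new paddle::framework::LoDTensor());  // receives thread i's output
}
TestMultiThreadInference<paddle::platform::CPUPlace>(dirname, feeds, fetchs,
                                                     num_threads);
// delete the tensors in feeds/fetchs when done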
add_subdirectory(detail) add_subdirectory(detail)
cc_library(memory SRCS memory.cc DEPS place enforce) cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce)
cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(paddle_memory cc_library(memory
DEPS DEPS
memory malloc
memcpy memcpy)
meta_data
meta_cache
memory_block
buddy_allocator
system_allocator)
cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) cc_test(malloc_test SRCS malloc_test.cc DEPS malloc)
#if (WITH_GPU) #if (WITH_GPU)
# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place paddle_memory) # nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory)
#endif() #endif()
cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc)
if(${WITH_GPU}) if(${WITH_GPU})
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
else(${WITH_GPU}) else(${WITH_GPU})
...@@ -6,10 +8,4 @@ endif(${WITH_GPU}) ...@@ -6,10 +8,4 @@ endif(${WITH_GPU})
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
cc_library(meta_data SRCS meta_data.cc) cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
cc_library(meta_cache SRCS meta_cache.cc)
cc_library(memory_block SRCS memory_block.cc)
cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
...@@ -46,7 +46,8 @@ inline size_t align(size_t size, size_t alignment) { ...@@ -46,7 +46,8 @@ inline size_t align(size_t size, size_t alignment) {
void* BuddyAllocator::Alloc(size_t unaligned_size) { void* BuddyAllocator::Alloc(size_t unaligned_size) {
// adjust allocation alignment // adjust allocation alignment
size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); size_t size =
align(unaligned_size + sizeof(MemoryBlock::Desc), min_chunk_size_);
// acquire the allocator lock // acquire the allocator lock
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
...@@ -103,7 +104,7 @@ void BuddyAllocator::Free(void* p) { ...@@ -103,7 +104,7 @@ void BuddyAllocator::Free(void* p) {
return; return;
} }
block->mark_as_free(cache_); block->mark_as_free(&cache_);
total_used_ -= block->total_size(cache_); total_used_ -= block->total_size(cache_);
total_free_ += block->total_size(cache_); total_free_ += block->total_size(cache_);
...@@ -122,7 +123,7 @@ void BuddyAllocator::Free(void* p) { ...@@ -122,7 +123,7 @@ void BuddyAllocator::Free(void* p) {
right_buddy)); right_buddy));
// merge its right buddy to the block // merge its right buddy to the block
block->merge(cache_, right_buddy); block->merge(&cache_, right_buddy);
} }
} }
...@@ -139,7 +140,7 @@ void BuddyAllocator::Free(void* p) { ...@@ -139,7 +140,7 @@ void BuddyAllocator::Free(void* p) {
left_buddy->total_size(cache_), left_buddy)); left_buddy->total_size(cache_), left_buddy));
// merge the block to its left buddy // merge the block to its left buddy
left_buddy->merge(cache_, block); left_buddy->merge(&cache_, block);
block = left_buddy; block = left_buddy;
} }
} }
...@@ -163,13 +164,13 @@ size_t BuddyAllocator::Used() { return total_used_; } ...@@ -163,13 +164,13 @@ size_t BuddyAllocator::Used() { return total_used_; }
void* BuddyAllocator::SystemAlloc(size_t size) { void* BuddyAllocator::SystemAlloc(size_t size) {
size_t index = 0; size_t index = 0;
void* p = system_allocator_->Alloc(index, size); void* p = system_allocator_->Alloc(&index, size);
VLOG(10) << "Allocated " << p << " from system allocator."; VLOG(10) << "Allocated " << p << " from system allocator.";
if (p == nullptr) return nullptr; if (p == nullptr) return nullptr;
static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index, static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
size, nullptr, nullptr); size, nullptr, nullptr);
return static_cast<MemoryBlock*>(p)->data(); return static_cast<MemoryBlock*>(p)->data();
...@@ -187,14 +188,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { ...@@ -187,14 +188,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
// Allocate a new maximum sized block // Allocate a new maximum sized block
size_t index = 0; size_t index = 0;
void* p = system_allocator_->Alloc(index, max_chunk_size_); void* p = system_allocator_->Alloc(&index, max_chunk_size_);
if (p == nullptr) return pool_.end(); if (p == nullptr) return pool_.end();
VLOG(10) << "Creating and inserting new block " << p VLOG(10) << "Creating and inserting new block " << p
<< " from system allocator"; << " from system allocator";
static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
max_chunk_size_, nullptr, nullptr); max_chunk_size_, nullptr, nullptr);
// gpu fallback allocation // gpu fallback allocation
...@@ -238,11 +239,11 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, ...@@ -238,11 +239,11 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_) VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_)
<< ") into"; << ") into";
block->split(cache_, size); block->split(&cache_, size);
VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_) VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_)
<< ")"; << ")";
block->set_type(cache_, MemoryBlock::ARENA_CHUNK); block->set_type(&cache_, MemoryBlock::ARENA_CHUNK);
// the rest of memory if exist // the rest of memory if exist
if (block->has_right_buddy(cache_)) { if (block->has_right_buddy(cache_)) {
......
...@@ -14,18 +14,18 @@ limitations under the License. */ ...@@ -14,18 +14,18 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/memory/detail/meta_cache.h" #include <mutex> // NOLINT
#include "paddle/fluid/memory/detail/meta_data.h" #include <set>
#include <tuple>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include <mutex>
#include <set>
#include <unordered_map>
#include <vector>
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
......
...@@ -13,143 +13,142 @@ See the License for the specific language governing permissions and ...@@ -13,143 +13,142 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_cache.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, void MemoryBlock::init(MetadataCache* cache, Type t, size_t index, size_t size,
void* left_buddy, void* right_buddy) { void* left_buddy, void* right_buddy) {
cache.store(this, Metadata(t, index, size - sizeof(Metadata), size, cache->save(
static_cast<MemoryBlock*>(left_buddy), this, MemoryBlock::Desc(t, index, size - sizeof(MemoryBlock::Desc), size,
static_cast<MemoryBlock*>(right_buddy))); static_cast<MemoryBlock*>(left_buddy),
static_cast<MemoryBlock*>(right_buddy)));
} }
MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { MemoryBlock::Type MemoryBlock::type(const MetadataCache& cache) const {
return cache.load(this).type; return cache.load(this).type;
} }
size_t MemoryBlock::size(MetadataCache& cache) const { size_t MemoryBlock::size(const MetadataCache& cache) const {
return cache.load(this).size; return cache.load(this).size;
} }
size_t MemoryBlock::total_size(MetadataCache& cache) const { size_t MemoryBlock::index(const MetadataCache& cache) const {
return cache.load(this).index;
}
size_t MemoryBlock::total_size(const MetadataCache& cache) const {
return cache.load(this).total_size; return cache.load(this).total_size;
} }
MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const { bool MemoryBlock::has_left_buddy(const MetadataCache& cache) const {
return left_buddy(cache) != nullptr;
}
bool MemoryBlock::has_right_buddy(const MetadataCache& cache) const {
return right_buddy(cache) != nullptr;
}
MemoryBlock* MemoryBlock::left_buddy(const MetadataCache& cache) const {
return cache.load(this).left_buddy; return cache.load(this).left_buddy;
} }
MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { MemoryBlock* MemoryBlock::right_buddy(const MetadataCache& cache) const {
return cache.load(this).right_buddy; return cache.load(this).right_buddy;
} }
void MemoryBlock::split(MetadataCache& cache, size_t size) { void MemoryBlock::split(MetadataCache* cache, size_t size) {
// make sure the split fits // make sure the split fits
PADDLE_ASSERT(total_size(cache) >= size); PADDLE_ASSERT(total_size(*cache) >= size);
// bail out if there is no room for another partition // bail out if there is no room for another partition
if (total_size(cache) - size <= sizeof(Metadata)) { if (total_size(*cache) - size <= sizeof(MemoryBlock::Desc)) {
return; return;
} }
// find the position of the split // find the position of the split
void* right_partition = reinterpret_cast<uint8_t*>(this) + size; void* right_partition = reinterpret_cast<uint8_t*>(this) + size;
size_t remaining_size = total_size(cache) - size; size_t remaining_size = total_size(*cache) - size;
// Add the new block as a buddy // Add the new block as a buddy
auto metadata = cache.load(this); auto metadata = cache->load(this);
// Write the metadata for the new block // Write the metadata for the new block
auto new_block_right_buddy = metadata.right_buddy; auto new_block_right_buddy = metadata.right_buddy;
cache.store( cache->save(static_cast<MemoryBlock*>(right_partition),
static_cast<MemoryBlock*>(right_partition), MemoryBlock::Desc(FREE_CHUNK, index(*cache),
Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata), remaining_size - sizeof(MemoryBlock::Desc),
remaining_size, this, new_block_right_buddy)); remaining_size, this, new_block_right_buddy));
metadata.right_buddy = static_cast<MemoryBlock*>(right_partition); metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
metadata.size = size - sizeof(Metadata); metadata.size = size - sizeof(MemoryBlock::Desc);
metadata.total_size = size; metadata.total_size = size;
cache.store(this, metadata); cache->save(this, metadata);
// Write metadata for the new block's right buddy // Write metadata for the new block's right buddy
if (new_block_right_buddy != nullptr) { if (new_block_right_buddy != nullptr) {
auto buddy_metadata = cache.load(new_block_right_buddy); auto buddy_metadata = cache->load(new_block_right_buddy);
buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition); buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);
cache.store(new_block_right_buddy, buddy_metadata); cache->save(new_block_right_buddy, buddy_metadata);
} }
} }
void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { void MemoryBlock::merge(MetadataCache* cache, MemoryBlock* right_buddy) {
// only free blocks can be merged // only free blocks can be merged
PADDLE_ASSERT(type(cache) == FREE_CHUNK); PADDLE_ASSERT(type(*cache) == FREE_CHUNK);
PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK); PADDLE_ASSERT(right_buddy->type(*cache) == FREE_CHUNK);
auto metadata = cache.load(this); auto metadata = cache->load(this);
// link this->buddy's buddy // link this->buddy's buddy
metadata.right_buddy = right_buddy->right_buddy(cache); metadata.right_buddy = right_buddy->right_buddy(*cache);
// link buddy's buddy -> this // link buddy's buddy -> this
if (metadata.right_buddy != nullptr) { if (metadata.right_buddy != nullptr) {
auto buddy_metadata = cache.load(metadata.right_buddy); auto buddy_metadata = cache->load(metadata.right_buddy);
buddy_metadata.left_buddy = this; buddy_metadata.left_buddy = this;
cache.store(metadata.right_buddy, buddy_metadata); cache->save(metadata.right_buddy, buddy_metadata);
} }
metadata.size += right_buddy->total_size(cache); metadata.size += right_buddy->total_size(*cache);
metadata.total_size += right_buddy->total_size(cache); metadata.total_size += right_buddy->total_size(*cache);
cache.store(this, metadata); cache->save(this, metadata);
cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); cache->save(right_buddy,
MemoryBlock::Desc(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
} }
void MemoryBlock::mark_as_free(MetadataCache& cache) { void MemoryBlock::mark_as_free(MetadataCache* cache) {
// check for double free or corruption // check for double free or corruption
PADDLE_ASSERT(type(cache) != FREE_CHUNK); PADDLE_ASSERT(type(*cache) != FREE_CHUNK);
PADDLE_ASSERT(type(cache) != INVALID_CHUNK); PADDLE_ASSERT(type(*cache) != INVALID_CHUNK);
set_type(cache, FREE_CHUNK); set_type(cache, FREE_CHUNK);
} }
void MemoryBlock::set_type(MetadataCache& cache, Type t) { void MemoryBlock::set_type(MetadataCache* cache, Type t) {
auto metadata = cache.load(this); auto metadata = cache->load(this);
metadata.type = t; metadata.type = t;
cache->save(this, metadata);
cache.store(this, metadata);
}
bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
return left_buddy(cache) != nullptr;
}
bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
return right_buddy(cache) != nullptr;
}
size_t MemoryBlock::index(MetadataCache& cache) const {
return cache.load(this).index;
} }
void* MemoryBlock::data() const { void* MemoryBlock::data() const {
return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1; return const_cast<MemoryBlock::Desc*>(
reinterpret_cast<const MemoryBlock::Desc*>(this)) +
1;
} }
MemoryBlock* MemoryBlock::metadata() const { MemoryBlock* MemoryBlock::metadata() const {
return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>( return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
reinterpret_cast<const Metadata*>(this) - 1)); reinterpret_cast<const MemoryBlock::Desc*>(this) - 1));
} }
} // namespace detail } // namespace detail
......
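data() and metadata() above rely on the MemoryBlock::Desc header sitting immediately in front of the payload, which is the CPU layout. A sketch of the round trip, where raw is assumed to point at the start of a CPU block:

// layout of a CPU block:  [ MemoryBlock::Desc | payload ... ]
auto* block = static_cast<MemoryBlock*>(raw);  // points at the Desc header
void* payload = block->data();                 // raw + sizeof(MemoryBlock::Desc)
MemoryBlock* back = static_cast<MemoryBlock*>(payload)->metadata();
// back == block: metadata() steps one Desc backwards from the payload pointer.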
...@@ -11,21 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,21 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <cstddef> #include <cstdint>
#include <unordered_map>
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
// Forward Declarations // Forward declaration.
class MetadataCache; class MetadataCache;
/*! \brief A class used to interpret the contents of a memory block */ // MemoryBlock represents each allocated memory block, which contains
class MemoryBlock { // MemoryBlock::Desc and the payload.
public: struct MemoryBlock {
enum Type { enum Type {
FREE_CHUNK, // memory is free and idle FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is being occupied ARENA_CHUNK, // memory is being occupied
...@@ -33,57 +33,96 @@ class MemoryBlock { ...@@ -33,57 +33,96 @@ class MemoryBlock {
INVALID_CHUNK // memory is invalid INVALID_CHUNK // memory is invalid
}; };
public: // init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
void init(MetadataCache& cache, Type t, size_t index, size_t size, // If it is a CPU memory block, the MetadataCache writes the
// MemoryBlock::Desc to the beginning of the block; or, if it is a GPU memory
// block, the MetadataCache writes the Metadata to a std::map in
// the CPU.
void init(MetadataCache* cache, Type t, size_t index, size_t size,
void* left_buddy, void* right_buddy); void* left_buddy, void* right_buddy);
public: // All these accessors returns fields in the MemoryBlock::Desc of the memory
/*! \brief The type of the allocation */ // block. They all need a MetadataCache instance as their first
Type type(MetadataCache& cache) const; // parameter because they read the MemoryBlock::Desc from the cache.
Type type(const MetadataCache& cache) const;
/*! \brief The size of the data region */ size_t size(const MetadataCache& cache) const;
size_t size(MetadataCache& cache) const; size_t index(const MetadataCache& cache) const;
size_t total_size(const MetadataCache& cache) const;
bool has_left_buddy(const MetadataCache& cache) const;
bool has_right_buddy(const MetadataCache& cache) const;
MemoryBlock* left_buddy(const MetadataCache& cache) const;
MemoryBlock* right_buddy(const MetadataCache& cache) const;
/*! \brief An index to track the allocator */ // Split the allocation into left/right blocks.
size_t index(MetadataCache& cache) const; void split(MetadataCache* cache, size_t size);
/*! \brief The total size of the block */ // Merge left and right blocks together.
size_t total_size(MetadataCache& cache) const; void merge(MetadataCache* cache, MemoryBlock* right_buddy);
/*! \brief Check the left buddy of the block */ // Mark the allocation as free.
bool has_left_buddy(MetadataCache& cache) const; void mark_as_free(MetadataCache* cache);
/*! \brief Check the right buddy of the block */ // Change the type of the allocation.
bool has_right_buddy(MetadataCache& cache) const; void set_type(MetadataCache* cache, Type t);
/*! \brief Get the left buddy */
MemoryBlock* left_buddy(MetadataCache& cache) const;
/*! \brief Get the right buddy */
MemoryBlock* right_buddy(MetadataCache& cache) const;
public:
/*! \brief Split the allocation into left/right blocks */
void split(MetadataCache& cache, size_t size);
/*! \brief Merge left and right blocks together */
void merge(MetadataCache& cache, MemoryBlock* right_buddy);
/*! \brief Mark the allocation as free */
void mark_as_free(MetadataCache& cache);
/*! \brief Change the type of the allocation */
void set_type(MetadataCache& cache, Type t);
public:
/*! \brief Get a pointer to the memory block's data */
void* data() const; void* data() const;
/*! \brief Get a pointer to the memory block's metadata */
MemoryBlock* metadata() const; MemoryBlock* metadata() const;
// MemoryBlock::Desc describes a MemoryBlock.
struct Desc {
Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
MemoryBlock* r);
Desc();
// Updates guard_begin and guard_end by hashes of the Metadata object.
void update_guards();
// Checks that guard_begin and guard_end are hashes of the Metadata object.
bool check_guards() const;
// TODO(gangliao): compress this
size_t guard_begin = 0;
MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK;
size_t index = 0;
size_t size = 0;
size_t total_size = 0;
MemoryBlock* left_buddy = nullptr;
MemoryBlock* right_buddy = nullptr;
size_t guard_end = 0;
};
};
// A cache for accessing memory block meta-data that may be expensive
// to access directly. This class exists to unify the
// MemoryBlock::Desc format between GPU and CPU allocations. It should
// be removed when the CPU can access all GPU allocations directly via
// UVM.
class MetadataCache {
public: public:
static size_t overhead(); explicit MetadataCache(bool uses_gpu);
// Disable copying and assignment.
MetadataCache(const MetadataCache&) = delete;
MetadataCache& operator=(const MetadataCache&) = delete;
// Returns the MemoryBlock::Desc for a memory block. When MetadataCache is
// used to manage CPU memory, the MemoryBlock::Desc resides at the beginning
// of the memory block; when used to manage GPU memory, the
// Metadata resides in CPU memory indexed by cache_.
MemoryBlock::Desc load(const MemoryBlock* memory_block) const;
// Saves the MemoryBlock::Desc of a memory block into the cache. For CPU
// memory block, writes the MemoryBlock::Desc to the beginning of the memory
// block; whereas for GPU memory, writes it to cache_.
void save(MemoryBlock* memory_block, const MemoryBlock::Desc& meta_data);
// For GPU memory block, erases its MemoryBlock::Desc from cache_.
void invalidate(MemoryBlock* memory_block);
private:
typedef std::unordered_map<const MemoryBlock*, MemoryBlock::Desc> MetadataMap;
MetadataMap cache_;
bool uses_gpu_;
}; };
} // namespace detail } // namespace detail
......
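A minimal sketch of how the buddy allocator drives this interface on a CPU block (uses_gpu == false, so the Desc is written straight at the head of the block; raw and chunk_size are assumptions of the sketch):

using paddle::memory::detail::MemoryBlock;
using paddle::memory::detail::MetadataCache;

MetadataCache cache(/*uses_gpu=*/false);
auto* block = static_cast<MemoryBlock*>(raw);
block->init(&cache, MemoryBlock::FREE_CHUNK, /*index=*/0, /*size=*/chunk_size,
            /*left_buddy=*/nullptr, /*right_buddy=*/nullptr);

MemoryBlock::Desc desc = cache.load(block);  // guards are asserted inside load()
// desc.total_size == chunk_size, desc.size == chunk_size - sizeof(MemoryBlock::Desc)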
...@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/meta_data.h"
#include <functional> #include <functional>
#include "paddle/fluid/memory/detail/memory_block.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock::Desc::Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
MemoryBlock* l, MemoryBlock* r) MemoryBlock* l, MemoryBlock* r)
: type(t), : type(t),
index(i), index(i),
size(s), size(s),
...@@ -29,7 +29,7 @@ Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, ...@@ -29,7 +29,7 @@ Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
left_buddy(l), left_buddy(l),
right_buddy(r) {} right_buddy(r) {}
Metadata::Metadata() MemoryBlock::Desc::Desc()
: type(MemoryBlock::INVALID_CHUNK), : type(MemoryBlock::INVALID_CHUNK),
index(0), index(0),
size(0), size(0),
...@@ -37,32 +37,36 @@ Metadata::Metadata() ...@@ -37,32 +37,36 @@ Metadata::Metadata()
left_buddy(nullptr), left_buddy(nullptr),
right_buddy(nullptr) {} right_buddy(nullptr) {}
namespace {
template <class T> template <class T>
inline void hash_combine(std::size_t& seed, const T& v) { inline void hash_combine(std::size_t* seed, const T& v) {
std::hash<T> hasher; std::hash<T> hasher;
seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); (*seed) ^= hasher(v) + 0x9e3779b9 + ((*seed) << 6) + ((*seed) >> 2);
} }
inline size_t hash(const Metadata* metadata, size_t initial_seed) { inline size_t hash(const MemoryBlock::Desc& metadata, size_t initial_seed) {
size_t seed = initial_seed; size_t seed = initial_seed;
hash_combine(seed, (size_t)metadata->type); hash_combine(&seed, static_cast<size_t>(metadata.type));
hash_combine(seed, metadata->index); hash_combine(&seed, metadata.index);
hash_combine(seed, metadata->size); hash_combine(&seed, metadata.size);
hash_combine(seed, metadata->total_size); hash_combine(&seed, metadata.total_size);
hash_combine(seed, metadata->left_buddy); hash_combine(&seed, metadata.left_buddy);
hash_combine(seed, metadata->right_buddy); hash_combine(&seed, metadata.right_buddy);
return seed; return seed;
} }
void Metadata::update_guards() { } // namespace
guard_begin = hash(this, 1);
guard_end = hash(this, 2); void MemoryBlock::Desc::update_guards() {
guard_begin = hash(*this, 1);
guard_end = hash(*this, 2);
} }
bool Metadata::check_guards() const { bool MemoryBlock::Desc::check_guards() const {
return guard_begin == hash(this, 1) && guard_end == hash(this, 2); return guard_begin == hash(*this, 1) && guard_end == hash(*this, 2);
} }
} // namespace detail } // namespace detail
......
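The two guards are hashes of the same Desc fields under different seeds (1 for guard_begin, 2 for guard_end), so any later change to a field breaks both checks. An illustrative sketch:

MemoryBlock::Desc desc(MemoryBlock::FREE_CHUNK, /*i=*/0, /*s=*/960, /*ts=*/1024,
                       nullptr, nullptr);
desc.update_guards();          // guard_begin = hash(desc, 1), guard_end = hash(desc, 2)
assert(desc.check_guards());   // fields and guards are consistent

desc.size = 128;               // simulate a corrupted header
assert(!desc.check_guards());  // the mismatch is caught on the next load()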
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/meta_cache.h"
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
...@@ -23,29 +22,28 @@ namespace detail { ...@@ -23,29 +22,28 @@ namespace detail {
MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
Metadata MetadataCache::load(const MemoryBlock* block) { MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
if (uses_gpu_) { if (uses_gpu_) {
auto existing_metadata = cache_.find(block); auto existing_desc = cache_.find(block);
PADDLE_ASSERT(existing_metadata->second.check_guards()); PADDLE_ASSERT(existing_desc->second.check_guards());
return existing_metadata->second; return existing_desc->second;
} else { } else {
auto* meta = reinterpret_cast<const Metadata*>(block); auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block);
VLOG(10) << "Load MetaData type=" << meta->type; VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type;
PADDLE_ASSERT(meta->check_guards()); PADDLE_ASSERT(desc->check_guards());
return *reinterpret_cast<const Metadata*>(block); return *reinterpret_cast<const MemoryBlock::Desc*>(block);
} }
} }
void MetadataCache::store(MemoryBlock* block, void MetadataCache::save(MemoryBlock* block,
const Metadata& original_metadata) { const MemoryBlock::Desc& original_desc) {
auto metadata = original_metadata; auto desc = original_desc;
desc.update_guards();
metadata.update_guards();
if (uses_gpu_) { if (uses_gpu_) {
cache_[block] = metadata; cache_[block] = desc;
} else { } else {
*reinterpret_cast<Metadata*>(block) = metadata; *reinterpret_cast<MemoryBlock::Desc*>(block) = desc;
} }
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include <unordered_map>
namespace paddle {
namespace memory {
namespace detail {
/**
* \brief A cache for accessing memory block meta-data that may be expensive
* to access directly.
*
* \note This class exists to unify the metadata format between GPU and CPU
* allocations. It should be removed when the CPU can access all GPU
* allocations directly via UVM.
*/
class MetadataCache {
public:
explicit MetadataCache(bool uses_gpu);
public:
/*! \brief Load the associated metadata for the specified memory block. */
Metadata load(const MemoryBlock* memory_block);
/*! \brief Store the associated metadata for the specified memory block. */
void store(MemoryBlock* memory_block, const Metadata& meta_data);
/*! \brief Indicate that the specified metadata will no longer be used. */
void invalidate(MemoryBlock* memory_block);
public:
MetadataCache(const MetadataCache&) = delete;
MetadataCache& operator=(const MetadataCache&) = delete;
private:
bool uses_gpu_;
private:
typedef std::unordered_map<const MemoryBlock*, Metadata> MetadataMap;
private:
MetadataMap cache_;
};
} // namespace detail
} // namespace memory
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/memory/detail/memory_block.h"
#include <stddef.h>
namespace paddle {
namespace memory {
namespace detail {
class Metadata {
public:
Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
MemoryBlock* r);
Metadata();
public:
/*! \brief Update the guards when metadata is changed */
void update_guards();
/*! \brief Check consistency to previous modification */
bool check_guards() const;
public:
// TODO(gangliao): compress this
// clang-format off
size_t guard_begin = 0;
MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK;
size_t index = 0;
size_t size = 0;
size_t total_size = 0;
MemoryBlock* left_buddy = nullptr;
MemoryBlock* right_buddy = nullptr;
size_t guard_end = 0;
// clang-format on
};
} // namespace detail
} // namespace memory
} // namespace paddle
...@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and ...@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include <stdlib.h> // for malloc and free #include <stdlib.h> // for malloc and free
#include <sys/mman.h> // for mlock and munlock #include <sys/mman.h> // for mlock and munlock
#include <algorithm> // for std::max #include <algorithm> // for std::max
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
// If use_pinned_memory is true, CPUAllocator calls mlock, which // If use_pinned_memory is true, CPUAllocator calls mlock, which
// returns pinned and locked memory as staging areas for data exchange // returns pinned and locked memory as staging areas for data exchange
...@@ -35,13 +35,13 @@ namespace paddle { ...@@ -35,13 +35,13 @@ namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
void* CPUAllocator::Alloc(size_t& index, size_t size) { void* CPUAllocator::Alloc(size_t* index, size_t size) {
// According to http://www.cplusplus.com/reference/cstdlib/malloc/, // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
// malloc might not return nullptr if size is zero, but the returned // malloc might not return nullptr if size is zero, but the returned
// pointer shall not be dereferenced -- so we make it nullptr. // pointer shall not be dereferenced -- so we make it nullptr.
if (size <= 0) return nullptr; if (size <= 0) return nullptr;
index = 0; // unlock memory *index = 0; // unlock memory
void* p; void* p;
...@@ -56,7 +56,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) { ...@@ -56,7 +56,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) {
if (p != nullptr) { if (p != nullptr) {
if (FLAGS_use_pinned_memory) { if (FLAGS_use_pinned_memory) {
index = 1; *index = 1;
mlock(p, size); // lock memory mlock(p, size); // lock memory
} }
} }
...@@ -75,7 +75,7 @@ bool CPUAllocator::UseGpu() const { return false; } ...@@ -75,7 +75,7 @@ bool CPUAllocator::UseGpu() const { return false; }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
void* GPUAllocator::Alloc(size_t& index, size_t size) { void* GPUAllocator::Alloc(size_t* index, size_t size) {
// CUDA documentation doesn't explain if cudaMalloc returns nullptr // CUDA documentation doesn't explain if cudaMalloc returns nullptr
// if size is 0. We just make sure it does. // if size is 0. We just make sure it does.
if (size <= 0) return nullptr; if (size <= 0) return nullptr;
...@@ -93,7 +93,7 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) { ...@@ -93,7 +93,7 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
} }
if (result == cudaSuccess) { if (result == cudaSuccess) {
index = 0; *index = 0;
gpu_alloc_size_ += size; gpu_alloc_size_ += size;
return p; return p;
} else { } else {
...@@ -133,7 +133,7 @@ bool GPUAllocator::UseGpu() const { return true; } ...@@ -133,7 +133,7 @@ bool GPUAllocator::UseGpu() const { return true; }
// PINNED memory allows direct DMA transfers by the GPU to and from system // PINNED memory allows direct DMA transfers by the GPU to and from system
// memory. It’s locked to a physical address. // memory. It’s locked to a physical address.
void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr; if (size <= 0) return nullptr;
// NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
...@@ -154,7 +154,7 @@ void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { ...@@ -154,7 +154,7 @@ void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
cudaError_t result = cudaMallocHost(&p, size); cudaError_t result = cudaMallocHost(&p, size);
if (result == cudaSuccess) { if (result == cudaSuccess) {
index = 1; // PINNED memory *index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size; cuda_pinnd_alloc_size_ += size;
return p; return p;
} else { } else {
......
...@@ -29,14 +29,14 @@ namespace detail { ...@@ -29,14 +29,14 @@ namespace detail {
class SystemAllocator { class SystemAllocator {
public: public:
virtual ~SystemAllocator() {} virtual ~SystemAllocator() {}
virtual void* Alloc(size_t& index, size_t size) = 0; virtual void* Alloc(size_t* index, size_t size) = 0;
virtual void Free(void* p, size_t size, size_t index) = 0; virtual void Free(void* p, size_t size, size_t index) = 0;
virtual bool UseGpu() const = 0; virtual bool UseGpu() const = 0;
}; };
class CPUAllocator : public SystemAllocator { class CPUAllocator : public SystemAllocator {
public: public:
virtual void* Alloc(size_t& index, size_t size); virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index); virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const; virtual bool UseGpu() const;
}; };
...@@ -46,7 +46,7 @@ class GPUAllocator : public SystemAllocator { ...@@ -46,7 +46,7 @@ class GPUAllocator : public SystemAllocator {
public: public:
explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
virtual void* Alloc(size_t& index, size_t size); virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index); virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const; virtual bool UseGpu() const;
...@@ -58,7 +58,7 @@ class GPUAllocator : public SystemAllocator { ...@@ -58,7 +58,7 @@ class GPUAllocator : public SystemAllocator {
class CUDAPinnedAllocator : public SystemAllocator { class CUDAPinnedAllocator : public SystemAllocator {
public: public:
virtual void* Alloc(size_t& index, size_t size); virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index); virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const; virtual bool UseGpu() const;
......
...@@ -22,11 +22,11 @@ limitations under the License. */ ...@@ -22,11 +22,11 @@ limitations under the License. */
DECLARE_bool(use_pinned_memory); DECLARE_bool(use_pinned_memory);
void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
bool freed = false; bool freed = false;
{ {
size_t index; size_t index;
void* p = a.Alloc(index, size); void* p = a->Alloc(&index, size);
if (size > 0) { if (size > 0) {
EXPECT_NE(p, nullptr); EXPECT_NE(p, nullptr);
} else { } else {
...@@ -36,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { ...@@ -36,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
int* i = static_cast<int*>(p); int* i = static_cast<int*>(p);
std::shared_ptr<int> ptr(i, [&](void* p) { std::shared_ptr<int> ptr(i, [&](void* p) {
freed = true; freed = true;
a.Free(p, size, index); a->Free(p, size, index);
}); });
} }
EXPECT_TRUE(freed); EXPECT_TRUE(freed);
...@@ -45,21 +45,21 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { ...@@ -45,21 +45,21 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
TEST(CPUAllocator, NoLockMem) { TEST(CPUAllocator, NoLockMem) {
FLAGS_use_pinned_memory = false; FLAGS_use_pinned_memory = false;
paddle::memory::detail::CPUAllocator a; paddle::memory::detail::CPUAllocator a;
TestAllocator(a, 2048); TestAllocator(&a, 2048);
TestAllocator(a, 0); TestAllocator(&a, 0);
} }
TEST(CPUAllocator, LockMem) { TEST(CPUAllocator, LockMem) {
FLAGS_use_pinned_memory = true; FLAGS_use_pinned_memory = true;
paddle::memory::detail::CPUAllocator a; paddle::memory::detail::CPUAllocator a;
TestAllocator(a, 2048); TestAllocator(&a, 2048);
TestAllocator(a, 0); TestAllocator(&a, 0);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
TEST(GPUAllocator, Alloc) { TEST(GPUAllocator, Alloc) {
paddle::memory::detail::GPUAllocator a(0); paddle::memory::detail::GPUAllocator a(0);
TestAllocator(a, 2048); TestAllocator(&a, 2048);
TestAllocator(a, 0); TestAllocator(&a, 0);
} }
#endif #endif
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/malloc.h"
#include "glog/logging.h" #include "glog/logging.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
/**
* \brief Allocate memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] size Allocation size.
*
* \return Allocated memory block address.
*
* \note If return nullptr, it indicates memory allocation failed
* because insufficient memory in current system. When Alloc
* function is invoked, you must check the returned memory
* address is valid or not.
*/
template <typename Place>
void* Alloc(Place place, size_t size);
/**
* \brief Free memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] ptr Memory block address to free.
*
*/
template <typename Place>
void Free(Place place, void* ptr);
/**
* \brief Total size of used memory in one place.
*
* \param[in] place Allocation place (CPU or GPU).
*
*/
template <typename Place>
size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
};
size_t memory_usage(const platform::Place& p);
/**
* \brief Free memory block in one place.
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD");
public:
explicit PODDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
private:
Place place_;
};
/**
* \brief Free memory block in one place does not meet POD
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PlainDeleter {
public:
explicit PlainDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
private:
Place place_;
};
} // namespace memory
} // namespace paddle
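A hedged usage sketch of this interface on the CPU place; per the \note above, the caller must check for a nullptr return:

#include <memory>
#include "paddle/fluid/memory/malloc.h"

paddle::platform::CPUPlace cpu;
void* p = paddle::memory::Alloc(cpu, 1024);
if (p != nullptr) {
  // ... use the 1024-byte block ...
  paddle::memory::Free(cpu, p);
}
size_t in_use = paddle::memory::Used(cpu);  // bytes currently allocated on the CPU place

// Or hand the block to a unique_ptr so Free() runs automatically via PODDeleter:
std::unique_ptr<float, paddle::memory::PODDeleter<float, paddle::platform::CPUPlace>>
    owned(static_cast<float*>(paddle::memory::Alloc(cpu, 256 * sizeof(float))),
          paddle::memory::PODDeleter<float, paddle::platform::CPUPlace>(cpu));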
...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/malloc.h"
#include <unordered_map> #include <unordered_map>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
...@@ -28,7 +27,7 @@ inline bool is_aligned(void const *p) { ...@@ -28,7 +27,7 @@ inline bool is_aligned(void const *p) {
} }
size_t align(size_t size, paddle::platform::CPUPlace place) { size_t align(size_t size, paddle::platform::CPUPlace place) {
size += sizeof(paddle::memory::detail::Metadata); size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::CpuMinChunkSize(); size_t alignment = paddle::platform::CpuMinChunkSize();
size_t remaining = size % alignment; size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining); return remaining == 0 ? size : size + (alignment - remaining);
...@@ -86,7 +85,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { ...@@ -86,7 +85,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
size_t align(size_t size, paddle::platform::CUDAPlace place) { size_t align(size_t size, paddle::platform::CUDAPlace place) {
size += sizeof(paddle::memory::detail::Metadata); size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::GpuMinChunkSize(); size_t alignment = paddle::platform::GpuMinChunkSize();
size_t remaining = size % alignment; size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining); return remaining == 0 ? size : size + (alignment - remaining);
...@@ -142,7 +141,7 @@ TEST(BuddyAllocator, GPUMultAlloc) { ...@@ -142,7 +141,7 @@ TEST(BuddyAllocator, GPUMultAlloc) {
} }
size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) { size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
size += sizeof(paddle::memory::detail::Metadata); size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::CUDAPinnedMinChunkSize(); size_t alignment = paddle::platform::CUDAPinnedMinChunkSize();
size_t remaining = size % alignment; size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining); return remaining == 0 ? size : size + (alignment - remaining);
......
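The three align() helpers above all perform the same rounding: add the size of the block descriptor, then round up to the place's minimum chunk size. As a worked example with made-up numbers (a 40-byte descriptor and a 4096-byte minimum chunk): align(5000) computes 5000 + 40 = 5040, 5040 % 4096 = 944, so the result is 5040 + (4096 - 944) = 8192.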
...@@ -14,91 +14,5 @@ limitations under the License. */ ...@@ -14,91 +14,5 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
namespace paddle {
namespace memory {
/**
* \brief Allocate memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] size Allocation size.
*
* \return Allocated memory block address.
*
 * \note If nullptr is returned, memory allocation failed because
 * there is insufficient memory in the current system. Whenever
 * Alloc is invoked, you must check whether the returned memory
 * address is valid.
*/
template <typename Place>
void* Alloc(Place place, size_t size);
/**
* \brief Free memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] ptr Memory block address to free.
*
*/
template <typename Place>
void Free(Place place, void* ptr);
/**
* \brief Total size of used memory in one place.
*
* \param[in] place Allocation place (CPU or GPU).
*
*/
template <typename Place>
size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
};
size_t memory_usage(const platform::Place& p);
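A short usage sketch of the interface declared above (relocated to malloc.h by this change); the CPU place and the 1 KB size are illustrative values, not from the diff:

  // Illustrative fragment, not from this commit.
  paddle::platform::CPUPlace cpu;
  void* p = paddle::memory::Alloc(cpu, 1024);
  if (p != nullptr) {  // Alloc can return nullptr, as the note above warns
    size_t used_bytes = paddle::memory::Used(cpu);  // total bytes used in this place
    (void)used_bytes;                               // e.g. log or assert on it
    paddle::memory::Free(cpu, p);
  }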
/**
* \brief Free memory block in one place.
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD");
public:
explicit PODDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
private:
Place place_;
};
/**
 * \brief Free memory block in one place for types that do not meet the POD requirement.
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PlainDeleter {
public:
explicit PlainDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
private:
Place place_;
};
} // namespace memory
} // namespace paddle
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
......
...@@ -263,7 +263,7 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) ...@@ -263,7 +263,7 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
limitations under the License. */ limitations under the License. */
#include "mkldnn.hpp" #include "mkldnn.hpp"
#include "mkldnn_activation_op.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mkldnn_activation_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -40,18 +40,24 @@ void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm, ...@@ -40,18 +40,24 @@ void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace()); const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
// get memory dim // get memory dim
PADDLE_ENFORCE(src->dims().size() == 4, PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
"Input dim must be with 4, i.e. NCHW"); "Input dim must be with 2 or 4");
std::vector<int> src_tz = framework::vectorize2int(src->dims()); std::vector<int> src_tz = framework::vectorize2int(src->dims());
// create memory description // create memory description
// TODO(kbinias-intel): support more formats auto data_md = src_tz.size() == 2
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw); mkldnn::memory::format::nc)
: platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// create memory primitives // create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data); auto src_memory =
auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data); mkldnn::memory({data_md, mkldnn_engine},
static_cast<void *>(const_cast<float *>(src_data)));
auto dst_memory =
mkldnn::memory({data_md, mkldnn_engine},
static_cast<void *>(const_cast<float *>(dst_data)));
auto forward_desc = mkldnn::eltwise_forward::desc( auto forward_desc = mkldnn::eltwise_forward::desc(
mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta); mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
...@@ -91,15 +97,21 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm, ...@@ -91,15 +97,21 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
std::vector<int> src_tz = framework::vectorize2int(x->dims()); std::vector<int> src_tz = framework::vectorize2int(x->dims());
// create memory description // create memory description
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, auto data_md = src_tz.size() == 2
mkldnn::memory::format::nchw); ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nc)
: platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// create memory primitives // create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src); auto src_memory = mkldnn::memory(
{data_md, mkldnn_engine}, static_cast<void *>(const_cast<float *>(src)));
auto diff_src_memory = auto diff_src_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src); mkldnn::memory({data_md, mkldnn_engine},
static_cast<void *>(const_cast<float *>(diff_src)));
auto diff_dst_memory = auto diff_dst_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst); mkldnn::memory({data_md, mkldnn_engine},
static_cast<void *>(const_cast<float *>(diff_dst)));
auto backward_desc = auto backward_desc =
mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta); mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
......
...@@ -662,14 +662,3 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, ...@@ -662,14 +662,3 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
ops::grad_functor<double>>); ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
REGISTER_OP_CPU_KERNEL(relu,
ops::ActivationKernel<paddle::platform::CPUDeviceContext,
ops::ReluFunctor<float>>,
ops::ActivationKernel<paddle::platform::CPUDeviceContext,
ops::ReluFunctor<double>>);
REGISTER_OP_CPU_KERNEL(
relu_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
ops::ReluGradFunctor<float>>,
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
ops::ReluGradFunctor<double>>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -17,31 +14,19 @@ limitations under the License. */ ...@@ -17,31 +14,19 @@ limitations under the License. */
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \ #define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \
act_type, ops::ActivationKernel<paddle::platform::CUDADeviceContext, \ REGISTER_OP_CUDA_KERNEL( \
ops::functor<float>>, \ act_type, \
ops::ActivationKernel<paddle::platform::CUDADeviceContext, \ ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<float>>, \
ops::functor<double>>); \ ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<double>>, \
REGISTER_OP_CUDA_KERNEL( \ ops::ActivationKernel<plat::CUDADeviceContext, \
act_type##_grad, \ ops::functor<plat::float16>>); \
ops::ActivationGradKernel<paddle::platform::CUDADeviceContext, \ REGISTER_OP_CUDA_KERNEL( \
ops::grad_functor<float>>, \ act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::ActivationGradKernel<paddle::platform::CUDADeviceContext, \ ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>); ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
REGISTER_OP_CUDA_KERNEL(
relu, ops::ActivationKernel<paddle::platform::CUDADeviceContext,
ops::ReluFunctor<float>>,
ops::ActivationKernel<paddle::platform::CUDADeviceContext,
ops::ReluFunctor<double>>,
ops::ActivationKernel<paddle::platform::CUDADeviceContext,
ops::ReluFunctor<paddle::platform::float16>>);
REGISTER_OP_CUDA_KERNEL(
relu_grad, ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
ops::ReluGradFunctor<float>>,
ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
ops::ReluGradFunctor<double>>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -13,9 +10,13 @@ See the License for the specific language governing permissions and ...@@ -13,9 +10,13 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/float16.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -336,11 +337,25 @@ struct Sine { ...@@ -336,11 +337,25 @@ struct Sine {
HOSTDEVICE T operator()(const T& val) const { return sin(val); } HOSTDEVICE T operator()(const T& val) const { return sin(val); }
}; };
template <>
struct Sine<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(sin(static_cast<float>(val)));
}
};
template <typename T> template <typename T>
struct Cosine { struct Cosine {
HOSTDEVICE T operator()(const T& val) const { return cos(val); } HOSTDEVICE T operator()(const T& val) const { return cos(val); }
}; };
template <>
struct Cosine<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(cos(static_cast<float>(val)));
}
};
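These specializations route float16 through float, presumably because sin/cos are not overloaded for float16: the value is widened, the math is done in float, and the result is narrowed back. For example (approximate, illustrative): Sine<platform::float16>()(float16(0.5f)) evaluates sin(0.5f) and returns roughly float16(0.48).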
// cosine'(x) = -sin(x) // cosine'(x) = -sin(x)
template <typename T> template <typename T>
struct CosGradFunctor : public BaseActivationFunctor<T> { struct CosGradFunctor : public BaseActivationFunctor<T> {
...@@ -824,6 +839,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -824,6 +839,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \
__macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(exp, ExpFunctor, ExpGradFunctor); \ __macro(exp, ExpFunctor, ExpGradFunctor); \
__macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \ __macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/adagrad_op.h" #include "paddle/fluid/operators/adagrad_op.h"
#include <vector>
#include <cmath> #include <cmath>
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/assign_value_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/auc_op.h" #include "paddle/fluid/operators/auc_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -40,7 +42,7 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -40,7 +42,7 @@ class AucKernel : public framework::OpKernel<T> {
std::vector<float> thresholds_list; std::vector<float> thresholds_list;
thresholds_list.reserve(num_thresholds); thresholds_list.reserve(num_thresholds);
for (int i = 1; i < num_thresholds - 1; i++) { for (int i = 1; i < num_thresholds - 1; i++) {
thresholds_list[i] = (float)i / (num_thresholds - 1); thresholds_list[i] = static_cast<float>(i) / (num_thresholds - 1);
} }
const float kEpsilon = 1e-7; const float kEpsilon = 1e-7;
thresholds_list[0] = 0.0f - kEpsilon; thresholds_list[0] = 0.0f - kEpsilon;
...@@ -105,11 +107,12 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -105,11 +107,12 @@ class AucKernel : public framework::OpKernel<T> {
float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace()); float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace());
float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace()); float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace());
for (int i = 0; i < num_thresholds; i++) { for (int i = 0; i < num_thresholds; i++) {
tp_rate_data[i] = tp_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon); (tp_data[i] + fn_data[i] + epsilon);
fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon); fp_rate_data[i] =
rec_rate_data[i] = static_cast<float>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon); rec_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
(tp_data[i] + fp_data[i] + epsilon);
} }
*auc_data = 0.0f; *auc_data = 0.0f;
if (curve == "ROC") { if (curve == "ROC") {
......
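For a concrete sanity check of the rewritten rate computations (illustrative counts, not from the diff): with tp = 90, fn = 10, fp = 30, tn = 70 and a negligible epsilon, the loop above yields tp_rate ≈ 90 / 100 = 0.9, fp_rate ≈ 30 / 100 = 0.3, and rec_rate ≈ 90 / 120 = 0.75 (the tp / (tp + fp) ratio).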
...@@ -19,15 +19,15 @@ namespace operators { ...@@ -19,15 +19,15 @@ namespace operators {
template <> template <>
void GetAccumulators<paddle::platform::CPUDeviceContext>( void GetAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t& num_updates_, const framework::ExecutionContext& ctx, int64_t* num_updates_,
int64_t& num_accumulates_, int64_t& old_num_accumulates_) { int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates"); auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates"); auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates"); auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0]; *old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
num_accumulates_ = in_num_accumulates->data<int64_t>()[0]; *num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
num_updates_ = in_num_updates->data<int64_t>()[0]; *num_updates_ = in_num_updates->data<int64_t>()[0];
} }
template <> template <>
......
...@@ -19,18 +19,18 @@ namespace paddle { ...@@ -19,18 +19,18 @@ namespace paddle {
namespace operators { namespace operators {
template <> template <>
void GetAccumulators<paddle::platform::CUDADeviceContext>( void GetAccumulators<paddle::platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx, int64_t& num_updates_, const framework::ExecutionContext& ctx, int64_t* num_updates_,
int64_t& num_accumulates_, int64_t& old_num_accumulates_) { int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates"); auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates"); auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates"); auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
memory::Copy(platform::CPUPlace(), &old_num_accumulates_, memory::Copy(platform::CPUPlace(), old_num_accumulates_,
platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(), platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
sizeof(int64_t), stream); sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(), memory::Copy(platform::CPUPlace(), num_accumulates_, platform::CUDAPlace(),
in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream); in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(), memory::Copy(platform::CPUPlace(), num_updates_, platform::CUDAPlace(),
in_num_updates->data<int64_t>(), sizeof(int64_t), stream); in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
} }
......
...@@ -29,8 +29,8 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>; ...@@ -29,8 +29,8 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext> template <typename DeviceContext>
void GetAccumulators(const framework::ExecutionContext& ctx, void GetAccumulators(const framework::ExecutionContext& ctx,
int64_t& num_updates, int64_t& num_accumulates, int64_t* num_updates, int64_t* num_accumulates,
int64_t& old_num_accumulates); int64_t* old_num_accumulates);
template <typename DeviceContext> template <typename DeviceContext>
void SetAccumulators(const framework::ExecutionContext& ctx, void SetAccumulators(const framework::ExecutionContext& ctx,
...@@ -47,8 +47,8 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> { ...@@ -47,8 +47,8 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
int64_t num_updates = 0; int64_t num_updates = 0;
int64_t num_accumulates = 0; int64_t num_accumulates = 0;
int64_t old_num_accumulates = 0; int64_t old_num_accumulates = 0;
GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates, GetAccumulators<DeviceContext>(ctx, &num_updates, &num_accumulates,
old_num_accumulates); &old_num_accumulates);
// Get attrs // Get attrs
float average_window = ctx.Attr<float>("average_window"); float average_window = ctx.Attr<float>("average_window");
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include <string>
#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_layout.h"
namespace paddle { namespace paddle {
......
...@@ -13,9 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/framework/data_layout.h"
#include <cfloat> #include <cfloat>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
......
...@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/compare_op.h" #include "paddle/fluid/operators/compare_op.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/operators/concat_op.h"
#include <string>
#include <vector> #include <vector>
namespace paddle { namespace paddle {
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <limits>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
......
...@@ -5,5 +5,5 @@ if(WITH_DISTRIBUTE) ...@@ -5,5 +5,5 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
cares zlib protobuf sendrecvop_grpc) cares zlib protobuf sendrecvop_grpc)
cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op)
endif() endif()
...@@ -65,9 +65,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, ...@@ -65,9 +65,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
} }
void ProcGetResponse(const VarHandle& var_h, void ProcGetResponse(const VarHandle& var_h,
// const sendrecv::VariableMessage& ret_msg) {
const ::grpc::ByteBuffer& ret_msg) { const ::grpc::ByteBuffer& ret_msg) {
framework::Variable* outvar = NULL; framework::Variable* outvar = nullptr;
DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
} }
...@@ -138,7 +137,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, ...@@ -138,7 +137,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
auto* var = p_scope->FindVar(in_var_name_val); auto* var = p_scope->FindVar(in_var_name_val);
::grpc::ByteBuffer req; ::grpc::ByteBuffer req;
SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req); SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
// var handle // var handle
VarHandle var_h; VarHandle var_h;
......
...@@ -138,39 +138,48 @@ class RequestPrefetch final : public RequestBase { ...@@ -138,39 +138,48 @@ class RequestPrefetch final : public RequestBase {
framework::Scope* scope, framework::Scope* scope,
const platform::DeviceContext* dev_ctx, const platform::DeviceContext* dev_ctx,
framework::Executor* executor, framework::Executor* executor,
framework::ProgramDesc* program, int blkid) framework::ProgramDesc* program,
framework::ExecutorPrepareContext* prefetch_ctx)
: RequestBase(service, cq, dev_ctx), : RequestBase(service, cq, dev_ctx),
responder_(&ctx_), responder_(&ctx_),
scope_(scope), scope_(scope),
executor_(executor), executor_(executor),
program_(program), program_(program),
blkid_(blkid) { prefetch_ctx_(prefetch_ctx) {
request_.reset(new VariableResponse(scope, dev_ctx_));
int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable); int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_, service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
cq_, this); cq_, cq_, this);
} }
virtual ~RequestPrefetch() {} virtual ~RequestPrefetch() {}
virtual std::string GetReqName() { return request_.varname(); } virtual std::string GetReqName() { return request_->Varname(); }
virtual void Process() { virtual void Process() {
// prefetch process... // prefetch process...
::grpc::ByteBuffer reply; ::grpc::ByteBuffer reply;
    // TODO(Yancey1989): execute the Block which contains prefetch ops
VLOG(3) << "RequestPrefetch Process in"; std::string var_name = request_->OutVarname();
auto var_desc = program_->Block(0).FindVar(var_name);
framework::Scope* local_scope = &scope_->NewScope();
auto* var = local_scope->FindVar(var_name);
InitializeVariable(var, var_desc->GetType());
executor_->RunPreparedContext(prefetch_ctx_, scope_, false, false);
SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
responder_.Finish(reply, ::grpc::Status::OK, this); responder_.Finish(reply, ::grpc::Status::OK, this);
status_ = FINISH; status_ = FINISH;
} }
protected: protected:
sendrecv::VariableMessage request_; std::shared_ptr<VariableResponse> request_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
framework::Scope* scope_; framework::Scope* scope_;
framework::Executor* executor_; framework::Executor* executor_;
framework::ProgramDesc* program_; framework::ProgramDesc* program_;
framework::ExecutorPrepareContext* prefetch_ctx_;
int blkid_; int blkid_;
}; };
...@@ -268,7 +277,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() { ...@@ -268,7 +277,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
} }
RequestPrefetch* prefetch = RequestPrefetch* prefetch =
new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_, new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_,
executor_, program_, prefetch_blk_id_); executor_, program_, prefetch_ctx_);
VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status(); VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
} }
......
...@@ -63,6 +63,10 @@ class AsyncGRPCServer final { ...@@ -63,6 +63,10 @@ class AsyncGRPCServer final {
void SetExecutor(framework::Executor *executor) { executor_ = executor; } void SetExecutor(framework::Executor *executor) { executor_ = executor; }
void SetPrefetchPreparedCtx(framework::ExecutorPrepareContext *prepared) {
prefetch_ctx_ = prepared;
}
int GetSelectedPort() { return selected_port_; } int GetSelectedPort() { return selected_port_; }
const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); } const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
...@@ -111,6 +115,7 @@ class AsyncGRPCServer final { ...@@ -111,6 +115,7 @@ class AsyncGRPCServer final {
std::unique_ptr<std::thread> t_prefetch_; std::unique_ptr<std::thread> t_prefetch_;
int prefetch_blk_id_; int prefetch_blk_id_;
framework::ExecutorPrepareContext *prefetch_ctx_;
framework::ProgramDesc *program_; framework::ProgramDesc *program_;
framework::Executor *executor_; framework::Executor *executor_;
int selected_port_; int selected_port_;
......
...@@ -20,43 +20,121 @@ limitations under the License. */ ...@@ -20,43 +20,121 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/detail/grpc_server.h" #include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
namespace framework = paddle::framework; namespace framework = paddle::framework;
namespace platform = paddle::platform; namespace platform = paddle::platform;
namespace detail = paddle::operators::detail; namespace detail = paddle::operators::detail;
USE_OP(lookup_table);
std::unique_ptr<detail::AsyncGRPCServer> rpc_service_; std::unique_ptr<detail::AsyncGRPCServer> rpc_service_;
framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0);
auto* block = program->AppendBlock(*root_block);
framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}});
framework::VariableNameMap output({{"Output", {"out"}}});
auto op = block->AppendOp();
op->SetType("lookup_table");
op->SetInput("W", {"w"});
op->SetInput("Ids", {"ids"});
op->SetOutput("Out", {"out"});
auto& out = *root_block->Var("out");
out.SetType(framework::proto::VarType::SELECTED_ROWS);
out.SetShape({10, 10});
return block;
}
void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
auto w_var = scope->Var("w");
w_var->GetMutable<framework::SelectedRows>();
auto out_var = scope->Var("out");
out_var->GetMutable<framework::SelectedRows>();
auto ids_var = scope->Var("ids");
ids_var->GetMutable<framework::SelectedRows>();
}
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto ids_var = scope->Var("ids")->GetMutable<framework::SelectedRows>();
auto rows = ids_var->mutable_rows();
for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2);
ids_var->mutable_value()->Resize({rows_numel, 1});
ids_var->mutable_value()->mutable_data<float>(*place);
}
void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
auto rows = w->mutable_rows();
for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i);
auto w_value = w->mutable_value();
w_value->Resize({rows_numel, 10});
auto ptr = w_value->mutable_data<float>(*place);
for (int64_t i = 0; i < w_value->numel(); ++i) {
ptr[i] = static_cast<float>(i / 10);
}
}
void StartServer(const std::string& endpoint) { void StartServer(const std::string& endpoint) {
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
framework::ProgramDesc program;
framework::Scope scope;
platform::CPUPlace place;
framework::Executor exe(place);
platform::CPUDeviceContext ctx(place);
auto* block = AppendPrefetchBlcok(&program);
auto prepared = exe.Prepare(program, block->ID());
InitTensorsOnServer(&scope, &place, 10);
rpc_service_->SetProgram(&program);
rpc_service_->SetPrefetchPreparedCtx(prepared.get());
rpc_service_->SetDevCtx(&ctx);
rpc_service_->SetScope(&scope);
rpc_service_->SetExecutor(&exe);
rpc_service_->RunSyncUpdate(); rpc_service_->RunSyncUpdate();
} }
TEST(PREFETCH, CPU) { TEST(PREFETCH, CPU) {
// start up a server instance backend // start up a server instance backend
// TODO(Yancey1989): Need to start a server with optimize blocks and
// prefetch blocks.
std::thread server_thread(StartServer, "127.0.0.1:8889"); std::thread server_thread(StartServer, "127.0.0.1:8889");
sleep(2);
framework::Scope scope; framework::Scope scope;
platform::CPUPlace place; platform::CPUPlace place;
platform::CPUDeviceContext ctx(place); platform::CPUDeviceContext ctx(place);
// create var on local scope // create var on local scope
std::string in_var_name("in"); int64_t rows_numel = 5;
InitTensorsOnClient(&scope, &place, rows_numel);
std::string in_var_name("ids");
std::string out_var_name("out"); std::string out_var_name("out");
auto* in_var = scope.Var(in_var_name);
auto* in_tensor = in_var->GetMutable<framework::LoDTensor>();
in_tensor->Resize({10, 10});
VLOG(3) << "before mutable_data";
in_tensor->mutable_data<int>(place);
scope.Var(out_var_name);
VLOG(3) << "before fetch";
detail::RPCClient client; detail::RPCClient client;
client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name, client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name,
out_var_name); out_var_name);
client.Wait(); client.Wait();
auto var = scope.Var(out_var_name);
auto value = var->GetMutable<framework::SelectedRows>()->value();
auto ptr = value.mutable_data<float>(place);
rpc_service_->ShutDown(); rpc_service_->ShutDown();
server_thread.join(); server_thread.join();
rpc_service_.reset(nullptr); rpc_service_.reset(nullptr);
for (int64_t i = 0; i < rows_numel; ++i) {
EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
}
} }
...@@ -21,7 +21,7 @@ service SendRecvService { ...@@ -21,7 +21,7 @@ service SendRecvService {
rpc SendVariable(VariableMessage) returns (VoidMessage) {} rpc SendVariable(VariableMessage) returns (VoidMessage) {}
// Argument VariableMessage for GetVariable should only contain varname. // Argument VariableMessage for GetVariable should only contain varname.
rpc GetVariable(VariableMessage) returns (VariableMessage) {} rpc GetVariable(VariableMessage) returns (VariableMessage) {}
// Prefetch variable by Ids // pre-fetch variable by given variable name and Ids
rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
} }
...@@ -67,6 +67,8 @@ message VariableMessage { ...@@ -67,6 +67,8 @@ message VariableMessage {
bytes serialized = 8; bytes serialized = 8;
// selected_rows data // selected_rows data
bytes rows = 9; bytes rows = 9;
// Look up table block execution output variable name.
string out_varname = 10;
} }
message VoidMessage {} message VoidMessage {}
...@@ -30,11 +30,9 @@ namespace detail { ...@@ -30,11 +30,9 @@ namespace detail {
void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg) { ::grpc::ByteBuffer* msg,
const std::string& out_name) {
using VarMsg = sendrecv::VariableMessage; using VarMsg = sendrecv::VariableMessage;
sendrecv::VariableMessage request;
std::string header;
request.AppendToString(&header);
// When using GPU, need to free the copied CPU buffer // When using GPU, need to free the copied CPU buffer
  // when the ByteBuffer is destroyed // when the ByteBuffer is destroyed
// TODO(typhoonzero): add unref here, if we have dependent // TODO(typhoonzero): add unref here, if we have dependent
...@@ -52,6 +50,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -52,6 +50,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
e.WriteUint64(VarMsg::kTypeFieldNumber, 1); e.WriteUint64(VarMsg::kTypeFieldNumber, 1);
} }
if (!out_name.empty()) {
e.WriteString(VarMsg::kOutVarnameFieldNumber, out_name);
}
switch (framework::ToVarType(var->Type())) { switch (framework::ToVarType(var->Type())) {
case framework::proto::VarType_Type_LOD_TENSOR: { case framework::proto::VarType_Type_LOD_TENSOR: {
auto tensor = var->Get<framework::LoDTensor>(); auto tensor = var->Get<framework::LoDTensor>();
......
...@@ -46,7 +46,8 @@ typedef void (*DestroyCallback)(void*); ...@@ -46,7 +46,8 @@ typedef void (*DestroyCallback)(void*);
void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg); ::grpc::ByteBuffer* msg,
const std::string& out_varname = std::string());
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
......
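Because out_varname defaults to an empty string (declaration above), existing call sites keep their original form while prefetch requests additionally name the variable the server should send back. An illustrative fragment (the names "ids" and "out" and the surrounding variables are assumptions, not from this change):

  // Illustrative only: var is a framework::Variable*, ctx a platform::DeviceContext.
  ::grpc::ByteBuffer req;
  SerializeToByteBuffer("ids", var, ctx, &req);         // ordinary send, no out name
  SerializeToByteBuffer("ids", var, ctx, &req, "out");  // prefetch: also record "out"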
...@@ -107,7 +107,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -107,7 +107,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
for (int i = 0; i < tensor_numel; ++i) { for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
} }
for (int64_t i = 0; i < rows2->size(); ++i) { for (size_t i = 0; i < rows2->size(); ++i) {
EXPECT_EQ(rows_data2[i], i); EXPECT_EQ(rows_data2[i], i);
} }
EXPECT_EQ(slr2->height(), 1000); EXPECT_EQ(slr2->height(), 1000);
......
...@@ -416,6 +416,20 @@ int VariableResponse::Parse(Source* source) { ...@@ -416,6 +416,20 @@ int VariableResponse::Parse(Source* source) {
} }
break; break;
} }
case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
uint32_t length;
if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
return tag;
}
std::string temp;
if (!input.ReadString(&temp, length)) {
return tag;
}
meta_.set_out_varname(temp);
break;
}
default: { default: {
// Unknown tag, return unknown error. // Unknown tag, return unknown error.
......
...@@ -55,6 +55,7 @@ class VariableResponse { ...@@ -55,6 +55,7 @@ class VariableResponse {
int Parse(const ::grpc::ByteBuffer& byte_buffer); int Parse(const ::grpc::ByteBuffer& byte_buffer);
inline std::string Varname() { return meta_.varname(); } inline std::string Varname() { return meta_.varname(); }
inline std::string OutVarname() { return meta_.out_varname(); }
// should call parse first. // should call parse first.
framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); } framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); }
......
...@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and ...@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
#ifdef __NVCC__ #ifdef __NVCC__
#include <cuda.h>
#include <thrust/iterator/iterator_adaptor.h> #include <thrust/iterator/iterator_adaptor.h>
#include "paddle/fluid/platform/cuda_helper.h"
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
#endif #endif
...@@ -43,35 +44,35 @@ namespace operators { ...@@ -43,35 +44,35 @@ namespace operators {
*/ */
inline void get_mid_dims(const framework::DDim& x_dims, inline void get_mid_dims(const framework::DDim& x_dims,
const framework::DDim& y_dims, const int axis, const framework::DDim& y_dims, const int axis,
int& pre, int& n, int& post) { int* pre, int* n, int* post) {
pre = 1; *pre = 1;
n = 1; *n = 1;
post = 1; *post = 1;
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
pre *= x_dims[i]; (*pre) *= x_dims[i];
} }
for (int i = 0; i < y_dims.size(); ++i) { for (int i = 0; i < y_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i], PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
"Broadcast dimension mismatch."); "Broadcast dimension mismatch.");
n *= y_dims[i]; (*n) *= y_dims[i];
} }
for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
post *= x_dims[i]; (*post) *= x_dims[i];
} }
} }
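As a worked example of the pointer-based signature introduced above (the concrete shapes are illustrative): for x_dims = (2, 3, 4, 5), y_dims = (3, 4) and axis = 1, the loops give pre = 2, n = 3 * 4 = 12 and post = 5, i.e. x is viewed as a (pre, n, post) block with y broadcast across the middle n elements.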
inline void trim_trailing_singular_dims(framework::DDim& dims) { inline void trim_trailing_singular_dims(framework::DDim* dims) {
// Remove trailing dimensions of size 1 for y // Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims.size(); auto actual_dims_size = dims->size();
for (; actual_dims_size != 0; --actual_dims_size) { for (; actual_dims_size != 0; --actual_dims_size) {
if (dims[actual_dims_size - 1] != 1) break; if ((*dims)[actual_dims_size - 1] != 1) break;
} }
if (actual_dims_size != dims.size()) { if (actual_dims_size != dims->size()) {
auto actual_dims = framework::vectorize(dims); auto actual_dims = framework::vectorize(*dims);
actual_dims.resize(actual_dims_size); actual_dims.resize(actual_dims_size);
dims = framework::make_ddim(actual_dims); *dims = framework::make_ddim(actual_dims);
} }
} }
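Continuing the same illustration, trim_trailing_singular_dims(&y_dim) drops only trailing size-1 dimensions: a y of shape (3, 4, 1, 1) becomes (3, 4), while (3, 1, 4) is left untouched because its singular dimension is not at the end.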
...@@ -159,7 +160,7 @@ class RowwiseTransformIterator<T, platform::CUDADeviceContext> ...@@ -159,7 +160,7 @@ class RowwiseTransformIterator<T, platform::CUDADeviceContext>
RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T*> RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
super_t; super_t;
HOSTDEVICE RowwiseTransformIterator(const T* x, int n) HOSTDEVICE RowwiseTransformIterator(const T* x, int n)
: super_t(x), begin_(x), n_(n){}; : super_t(x), begin_(x), n_(n) {}
friend class thrust::iterator_core_access; friend class thrust::iterator_core_access;
private: private:
...@@ -179,7 +180,7 @@ class MidWiseTransformIterator<T, platform::CUDADeviceContext> ...@@ -179,7 +180,7 @@ class MidWiseTransformIterator<T, platform::CUDADeviceContext>
MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T*> MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
super_t; super_t;
HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post)
: super_t(x), begin_(x), n_(n), post_(post){}; : super_t(x), begin_(x), n_(n), post_(post) {}
friend class thrust::iterator_core_access; friend class thrust::iterator_core_access;
private: private:
...@@ -333,6 +334,55 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, ...@@ -333,6 +334,55 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out,
} }
} }
#ifdef __NVCC__ #ifdef __NVCC__
// __shfl_down has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
  // TODO(zcd): The warp size should be taken from the GPU's
  // properties rather than hard-coded as 32.
  // To make reduceSum more efficient, warp-level parallelism is
  // used here under the assumption that the warp size is 32;
  // this may differ between GPUs, but most cards use 32.
__shared__ T shm[32];
const int warpSize = 32;
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset);
if (tid < warpSize) shm[tid] = 0;
__syncthreads();
if (tid % warpSize == 0) {
shm[tid / warpSize] = val;
}
CREATE_SHFL_MASK(mask, tid < warpSize);
if (tid < warpSize) {
val = shm[tid];
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset);
}
return val;
}
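A minimal kernel sketch showing how this block-wide reduction is intended to be driven (not part of this change; it assumes a single block of at most 1024 threads, matching ELEMWISE_MAX_BLOCK_DIM above):

  // Illustrative kernel only; reduceSum is the helper defined above.
  template <typename T>
  __global__ void BlockSumSketch(const T* in, T* out, int len) {
    int tid = threadIdx.x;
    T val = (tid < len) ? in[tid] : static_cast<T>(0);  // one element per thread
    val = reduceSum(val, tid, len);  // warp shuffles, then combine via shm[]
    if (tid == 0) *out = val;        // thread 0 ends up holding the block sum
  }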
template <typename T, typename DX_OP, typename DY_OP> template <typename T, typename DX_OP, typename DY_OP>
static __global__ void ElemwiseGradBroadcast1CUDAKernel( static __global__ void ElemwiseGradBroadcast1CUDAKernel(
const T* x, const T* y, const T* out, const T* dout, int h, int w, const T* x, const T* y, const T* out, const T* dout, int h, int w,
...@@ -355,7 +405,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( ...@@ -355,7 +405,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
if (dy) { if (dy) {
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
val = platform::reduceSum(val, tid, h); val = reduceSum(val, tid, h);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dy[j] = val; dy[j] = val;
} }
...@@ -432,7 +482,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( ...@@ -432,7 +482,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
if (dy) { if (dy) {
int h = pre * post; int h = pre * post;
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
val = platform::reduceSum(val, tid, h); val = reduceSum(val, tid, h);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dy[j] = val; dy[j] = val;
} }
...@@ -472,11 +522,11 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, ...@@ -472,11 +522,11 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
auto y_dim = y.dims(); auto y_dim = y.dims();
axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
trim_trailing_singular_dims(y_dim); trim_trailing_singular_dims(&y_dim);
axis = (y_dim.size() == 0) ? x_dim.size() : axis; axis = (y_dim.size() == 0) ? x_dim.size() : axis;
int pre, n, post; int pre, n, post;
get_mid_dims(x_dim, y_dim, axis, pre, n, post); get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
if (post == 1) { if (post == 1) {
int h = pre; int h = pre;
int w = n; int w = n;
...@@ -514,7 +564,7 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, ...@@ -514,7 +564,7 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
} }
} }
} }
}; }
template <typename DeviceContext, typename T, typename functor, template <typename DeviceContext, typename T, typename functor,
typename broadcastfunctor, typename broadcast2functor> typename broadcastfunctor, typename broadcast2functor>
...@@ -543,11 +593,11 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx, ...@@ -543,11 +593,11 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
} }
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
trim_trailing_singular_dims(y_dims); trim_trailing_singular_dims(&y_dims);
axis = (y_dims.size() == 0) ? x_dims.size() : axis; axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post; int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post); get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
if (post == 1) { if (post == 1) {
broadcastfunctor f; broadcastfunctor f;
...@@ -582,11 +632,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx, ...@@ -582,11 +632,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)"); "Axis should be in range [0, x_dims)");
trim_trailing_singular_dims(y_dims); trim_trailing_singular_dims(&y_dims);
axis = (y_dims.size() == 0) ? x_dims.size() : axis; axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post; int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post); get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
if (post == 1) { if (post == 1) {
functor.RunRowWise(n, pre); functor.RunRowWise(n, pre);
return; return;
......
...@@ -56,11 +56,11 @@ class GoOp : public framework::OperatorBase { ...@@ -56,11 +56,11 @@ class GoOp : public framework::OperatorBase {
// TODO(varunarora): Consider moving this root scope lookup to scope.h. // TODO(varunarora): Consider moving this root scope lookup to scope.h.
const framework::Scope *root_scope = &scope; const framework::Scope *root_scope = &scope;
const framework::Scope *parent_scope = &(root_scope->parent()); const framework::Scope *parent_scope = root_scope->parent();
while (parent_scope != nullptr) { while (parent_scope != nullptr) {
root_scope = parent_scope; root_scope = parent_scope;
parent_scope = &(parent_scope->parent()); parent_scope = parent_scope->parent();
} }
framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock); framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock);
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -35,7 +37,7 @@ class LoDResetKernel : public framework::OpKernel<T> { ...@@ -35,7 +37,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
if (lod_t->lod().size() > 0) { if (lod_t->lod().size() > 0) {
auto y_lod = lod_t->lod(); auto y_lod = lod_t->lod();
auto last_level = y_lod[y_lod.size() - 1]; auto last_level = y_lod[y_lod.size() - 1];
PADDLE_ENFORCE_EQ(last_level.back(), in->dims()[0], PADDLE_ENFORCE_EQ((int64_t)(last_level.back()), in->dims()[0],
"Last value of `Y`'s last level LoD should be equal " "Last value of `Y`'s last level LoD should be equal "
"to the first dimension of `X`"); "to the first dimension of `X`");
out->set_lod(y_lod); out->set_lod(y_lod);
......
...@@ -39,13 +39,14 @@ void gemm<platform::CUDADeviceContext, float16>( ...@@ -39,13 +39,14 @@ void gemm<platform::CUDADeviceContext, float16>(
cublasOperation_t cuTransB = cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
// TODO(kexinzhao): add processing code for compute capability < 53 case // TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas fp16 gemm requires GPU compute capability >= 53"); "cublas fp16 gemm requires GPU compute capability >= 53");
#if CUDA_VERSION >= 8000
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
if (context.GetComputeCapability() >= 70) { if (context.GetComputeCapability() >= 70) {
...@@ -56,7 +57,7 @@ void gemm<platform::CUDADeviceContext, float16>( ...@@ -56,7 +57,7 @@ void gemm<platform::CUDADeviceContext, float16>(
PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(), PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
CUBLAS_DEFAULT_MATH)); CUBLAS_DEFAULT_MATH));
} }
#endif #endif // CUDA_VERSION >= 9000
// cublasHgemm does true FP16 computation which is slow for non-Volta // cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead, which does pseudo FP16 computation: // GPUs. So use cublasGemmEx instead, which does pseudo FP16 computation:
...@@ -66,6 +67,18 @@ void gemm<platform::CUDADeviceContext, float16>( ...@@ -66,6 +67,18 @@ void gemm<platform::CUDADeviceContext, float16>(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B, context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
CUDA_R_32F, algo)); CUDA_R_32F, algo));
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to cublasHgemm
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, N));
#endif // CUDA_VERSION >= 8000
} }
template <> template <>
......
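For reference, a minimal standalone sketch of the pseudo-FP16 GEMM path described in the comments above: inputs and outputs are stored as half (CUDA_R_16F) while accumulation is done in float (CUDA_R_32F) via cublasGemmEx, which needs CUDA >= 8.0 and compute capability >= 53. This is not Paddle code; matrix size, layout, and the missing error checks are illustrative simplifications.

#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

int main() {
  const int n = 64;                                 // illustrative square matrices
  const size_t bytes = n * n * sizeof(__half);
  __half *d_a, *d_b, *d_c;
  cudaMalloc(reinterpret_cast<void**>(&d_a), bytes);
  cudaMalloc(reinterpret_cast<void**>(&d_b), bytes);
  cudaMalloc(reinterpret_cast<void**>(&d_c), bytes);
  // (Fill d_a and d_b with half data here; omitted for brevity.)

  cublasHandle_t handle;
  cublasCreate(&handle);

  // Scalars are float because the compute type below is CUDA_R_32F.
  const float h_alpha = 1.0f, h_beta = 0.0f;
  // C = alpha * A * B + beta * C, with half storage and float accumulation.
  cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &h_alpha,
               d_a, CUDA_R_16F, n, d_b, CUDA_R_16F, n, &h_beta,
               d_c, CUDA_R_16F, n, CUDA_R_32F, CUBLAS_GEMM_DFALT);

  cublasDestroy(handle);
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return 0;
}

On CUDA 7.5, where cublasGemmEx is unavailable, the same multiply falls back to cublasHgemm with half scalars, as in the #else branch of the hunk above.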
...@@ -66,13 +66,7 @@ class ReadOp : public framework::OperatorBase { ...@@ -66,13 +66,7 @@ class ReadOp : public framework::OperatorBase {
std::vector<std::string> out_arg_names = Outputs("Out"); std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins; std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins); reader->ReadNext(&ins);
if (ins.empty()) { PADDLE_ENFORCE(!ins.empty(), "There is no next data.");
reader->ReInit();
reader->ReadNext(&ins);
PADDLE_ENFORCE(
!ins.empty(),
"Reader can not read the next data even it has been re-initialized.");
}
PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
for (size_t i = 0; i < ins.size(); ++i) { for (size_t i = 0; i < ins.size(); ++i) {
auto* out = auto* out =
......
...@@ -22,5 +22,6 @@ reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc) ...@@ -22,5 +22,6 @@ reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc) reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc) reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
# Export local libraries to parent # Export local libraries to parent
set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE) set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
...@@ -63,13 +63,14 @@ class DoubleBufferReader : public framework::DecoratedReader { ...@@ -63,13 +63,14 @@ class DoubleBufferReader : public framework::DecoratedReader {
StartPrefetcher(); StartPrefetcher();
} }
bool HasNext() const override;
void ReadNext(std::vector<framework::LoDTensor>* out) override; void ReadNext(std::vector<framework::LoDTensor>* out) override;
void ReInit() override; void ReInit() override;
~DoubleBufferReader() { EndPrefetcher(); } ~DoubleBufferReader() { EndPrefetcher(); }
private: private:
bool HasNext() const;
void StartPrefetcher() { void StartPrefetcher() {
channel_ = framework::MakeChannel<Item>(kChannelSize); channel_ = framework::MakeChannel<Item>(kChannelSize);
prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
...@@ -109,7 +110,9 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { ...@@ -109,7 +110,9 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
auto place_str = Attr<std::string>("place"); auto place_str = Attr<std::string>("place");
platform::Place place; platform::Place place;
if (place_str == "CPU") { if (place_str == "AUTO") {
place = dev_place;
} else if (place_str == "CPU") {
place = platform::CPUPlace(); place = platform::CPUPlace();
} else { } else {
std::istringstream sin(place_str); std::istringstream sin(place_str);
...@@ -140,28 +143,22 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { ...@@ -140,28 +143,22 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
enum_range.insert(string::Sprintf("CUDA:%d", i)); enum_range.insert(string::Sprintf("CUDA:%d", i));
} }
enum_range.insert("CPU"); enum_range.insert("CPU");
AddAttr<std::string>("place", "The double buffer place, default is CPU") enum_range.insert("AUTO");
.SetDefault("CPU") AddAttr<std::string>("place", "The double buffer place")
.SetDefault("AUTO")
.InEnum({enum_range}); .InEnum({enum_range});
} }
}; };
bool DoubleBufferReader::HasNext() const {
while (!channel_->IsClosed() && !channel_->CanReceive()) {
}
return channel_->CanReceive();
}
void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) { void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
if (!HasNext()) { out->clear();
PADDLE_THROW("There is no next data!"); if (HasNext()) {
} Item batch;
channel_->Receive(&batch);
Item batch; *out = batch.payloads_;
channel_->Receive(&batch); if (batch.ctx_) {
*out = batch.payloads_; batch.ctx_->Wait();
if (batch.ctx_) { }
batch.ctx_->Wait();
} }
} }
...@@ -171,16 +168,26 @@ void DoubleBufferReader::ReInit() { ...@@ -171,16 +168,26 @@ void DoubleBufferReader::ReInit() {
StartPrefetcher(); StartPrefetcher();
} }
bool DoubleBufferReader::HasNext() const {
while (!channel_->IsClosed() && !channel_->CanReceive()) {
}
return channel_->CanReceive();
}
void DoubleBufferReader::PrefetchThreadFunc() { void DoubleBufferReader::PrefetchThreadFunc() {
VLOG(5) << "A new prefetch thread starts."; VLOG(5) << "A new prefetch thread starts.";
std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(kCacheSize); std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(kCacheSize);
std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(kCacheSize); std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(kCacheSize);
size_t cached_tensor_id = 0; size_t cached_tensor_id = 0;
while (reader_->HasNext()) { while (true) {
Item batch; Item batch;
auto& cpu_batch = cpu_tensor_cache[cached_tensor_id]; auto& cpu_batch = cpu_tensor_cache[cached_tensor_id];
reader_->ReadNext(&cpu_batch); reader_->ReadNext(&cpu_batch);
if (cpu_batch.empty()) {
// The underlying reader has no next data.
break;
}
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
auto& gpu_batch = gpu_tensor_cache[cached_tensor_id]; auto& gpu_batch = gpu_tensor_cache[cached_tensor_id];
auto* gpu_ctx = ctxs_[cached_tensor_id].get(); auto* gpu_ctx = ctxs_[cached_tensor_id].get();
......
...@@ -25,22 +25,12 @@ class MultiPassReader : public framework::DecoratedReader { ...@@ -25,22 +25,12 @@ class MultiPassReader : public framework::DecoratedReader {
: DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {} : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
void ReadNext(std::vector<framework::LoDTensor>* out) override { void ReadNext(std::vector<framework::LoDTensor>* out) override {
if (!HasNext()) {
PADDLE_THROW("There is no next data!");
}
reader_->ReadNext(out); reader_->ReadNext(out);
} if (out->empty()) {
bool HasNext() const override {
if (reader_->HasNext()) {
return true;
} else {
++pass_count_; ++pass_count_;
if (pass_count_ >= pass_num_) { if (pass_count_ < pass_num_) {
return false;
} else {
reader_->ReInit(); reader_->ReInit();
return true; reader_->ReadNext(out);
} }
} }
} }
......
...@@ -52,8 +52,6 @@ class RandomDataGenerator : public framework::ReaderBase { ...@@ -52,8 +52,6 @@ class RandomDataGenerator : public framework::ReaderBase {
void ReInit() override { return; } void ReInit() override { return; }
bool HasNext() const override { return true; }
private: private:
float min_; float min_;
float max_; float max_;
...@@ -74,7 +72,7 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase { ...@@ -74,7 +72,7 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase {
const auto& ranks = Attr<std::vector<int>>("ranks"); const auto& ranks = Attr<std::vector<int>>("ranks");
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
int(shape_concat.size()), static_cast<int>(shape_concat.size()),
"The accumulate of all ranks should be equal to the " "The accumulate of all ranks should be equal to the "
"shape concat's length."); "shape concat's length.");
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks); std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
......
...@@ -12,8 +12,6 @@ ...@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <mutex>
#include <thread>
#include "paddle/fluid/operators/reader/reader_op_registry.h" #include "paddle/fluid/operators/reader/reader_op_registry.h"
#include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/scanner.h"
...@@ -35,17 +33,15 @@ class RecordIOFileReader : public framework::FileReader { ...@@ -35,17 +33,15 @@ class RecordIOFileReader : public framework::FileReader {
LOG(INFO) << "Creating file reader" << filename; LOG(INFO) << "Creating file reader" << filename;
} }
bool HasNext() const override { return scanner_.HasNext(); }
void ReInit() override { scanner_.Reset(); } void ReInit() override { scanner_.Reset(); }
protected: protected:
void ReadNextImpl(std::vector<framework::LoDTensor>* out) override { void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
if (ThreadSafe) { if (ThreadSafe) {
std::lock_guard<std::mutex> guard(*mutex_); std::lock_guard<std::mutex> guard(*mutex_);
*out = framework::ReadFromRecordIO(scanner_, dev_ctx_); *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
} else { } else {
*out = framework::ReadFromRecordIO(scanner_, dev_ctx_); *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
} }
} }
...@@ -66,7 +62,7 @@ class CreateRecordIOReaderOp : public framework::OperatorBase { ...@@ -66,7 +62,7 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
const auto& ranks = Attr<std::vector<int>>("ranks"); const auto& ranks = Attr<std::vector<int>>("ranks");
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
int(shape_concat.size()), static_cast<int>(shape_concat.size()),
"The accumulate of all ranks should be equal to the " "The accumulate of all ranks should be equal to the "
"shape concat's length."); "shape concat's length.");
std::string filename = Attr<std::string>("filename"); std::string filename = Attr<std::string>("filename");
......
...@@ -30,35 +30,33 @@ class ShuffleReader : public framework::DecoratedReader { ...@@ -30,35 +30,33 @@ class ShuffleReader : public framework::DecoratedReader {
std::random_device device; std::random_device device;
seed_ = device(); seed_ = device();
} }
ReadIntoBuffers(); ReloadBuffer();
} }
void ReadNext(std::vector<framework::LoDTensor>* out) override { void ReadNext(std::vector<framework::LoDTensor>* out) override {
if (!HasNext()) { out->clear();
PADDLE_THROW("There is no next data!");
}
if (iteration_pos_ >= buffer_.size()) { if (iteration_pos_ >= buffer_.size()) {
VLOG(10) << "Resetting shuffle buffer"; VLOG(10) << "Resetting shuffle buffer";
ReadIntoBuffers(); ReloadBuffer();
if (buffer_.empty()) {
return;
}
} }
*out = buffer_[iteration_pos_++]; *out = buffer_[iteration_pos_++];
} }
bool HasNext() const override {
return iteration_pos_ < buffer_.size() || reader_->HasNext();
}
private: private:
void ReadIntoBuffers() { void ReloadBuffer() {
buffer_.clear(); buffer_.clear();
buffer_.reserve(buffer_size_); buffer_.reserve(buffer_size_);
iteration_pos_ = 0; iteration_pos_ = 0;
for (size_t i = 0; i < buffer_size_; ++i) { for (size_t i = 0; i < buffer_size_; ++i) {
if (!reader_->HasNext()) { std::vector<framework::LoDTensor> ins;
reader_->ReadNext(&ins);
if (ins.empty()) {
break; break;
} }
buffer_.emplace_back(); buffer_.emplace_back(ins);
reader_->ReadNext(&buffer_.back());
} }
std::mt19937 g(seed_); std::mt19937 g(seed_);
std::shuffle(buffer_.begin(), buffer_.end(), g); std::shuffle(buffer_.begin(), buffer_.end(), g);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class ThreadedReader : public framework::DecoratedReader {
public:
ThreadedReader(ReaderBase* reader, bool safe_mode)
: DecoratedReader(reader), safe_mode_(safe_mode) {}
void ReadNext(std::vector<framework::LoDTensor>* out) override {
std::lock_guard<std::mutex> lock(mutex_);
reader_->ReadNext(out);
}
void ReInit() override {
if (safe_mode_) {
PADDLE_THROW(
"ThreadedReader::ReInit() is disabled when 'safe_mode' is true.");
}
VLOG(5) << "ThreadedReader::ReInit() is invoked! It might be buggy in "
"multi-thread environment.";
reader_->ReInit();
}
private:
bool safe_mode_;
std::mutex mutex_;
};
class CreateThreadedReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
auto* out = detail::Ref(scope.FindVar(Output("Out")))
.GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
bool safe_mode = Attr<bool>("safe_mode");
out->Reset(new ThreadedReader(underlying_reader.Get(), safe_mode));
}
};
class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase {
public:
CreateThreadedReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: DecoratedReaderMakerBase(op_proto, op_checker) {
AddAttr<bool>("safe_mode",
"When 'safe_mode' is true, 'ReInit()' is disabled to avoid "
"unexpected bugs in multi-thread environment.")
.SetDefault(true);
AddComment(R"DOC(
CreateThreadedReader Operator
This operator creates a threaded reader. A threaded reader's
'ReadNext()' can be invoked by several threads at the same
time.
When the attribute 'safe_mode' is true, the threaded reader's
'ReInit()' is disabled to avoid unexpected bugs in a multi-thread
environment.
)DOC");
}
};
} // namespace reader
} // namespace operators
} // namespace paddle
namespace reader = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_threaded_reader,
reader::CreateThreadedReaderOp,
reader::CreateThreadedReaderOpMaker);
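A minimal standalone sketch of the pattern this operator wraps: a decorating reader whose ReadNext() is serialized with a mutex so that several worker threads can pull batches concurrently, with an empty result marking the end of data. The types below (Batch, FakeReader, ThreadSafeReader) are illustrative stand-ins, not the Paddle classes.

#include <atomic>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

using Batch = std::vector<int>;

struct FakeReader {
  // Produces ten one-element batches, then empty batches to signal exhaustion.
  Batch ReadNext() {
    int i = counter_++;
    return i < 10 ? Batch{i} : Batch{};
  }
  std::atomic<int> counter_{0};
};

struct ThreadSafeReader {
  explicit ThreadSafeReader(FakeReader* reader) : reader_(reader) {}
  Batch ReadNext() {
    std::lock_guard<std::mutex> lock(mutex_);  // serialize access, like ThreadedReader
    return reader_->ReadNext();
  }
  FakeReader* reader_;
  std::mutex mutex_;
};

int main() {
  FakeReader underlying;
  ThreadSafeReader reader(&underlying);
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t) {
    workers.emplace_back([&reader, t] {
      for (;;) {
        Batch b = reader.ReadNext();
        if (b.empty()) break;  // an empty batch means no more data
        std::cout << "thread " << t << " got batch " << b[0] << "\n";
      }
    });
  }
  for (auto& w : workers) w.join();
  return 0;
}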
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <thread> // NOLINT
#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h" #include "paddle/fluid/operators/reader/reader_op_registry.h"
...@@ -19,38 +21,23 @@ namespace paddle { ...@@ -19,38 +21,23 @@ namespace paddle {
namespace operators { namespace operators {
namespace reader { namespace reader {
class MultipleReader : public framework::ReaderBase { class MultiFileReader : public framework::ReaderBase {
public: public:
class ThreadBufferMap { MultiFileReader(const std::vector<std::string>& file_names,
public: const std::vector<framework::DDim>& dims, size_t thread_num,
std::vector<framework::LoDTensor>& operator[]( size_t buffer_size)
const std::thread::id& thread_id) { : file_names_(file_names), dims_(dims), buffer_size_(buffer_size) {
std::lock_guard<std::mutex> lock(mutex_);
return buffer_[thread_id];
}
void Clear() { buffer_.clear(); }
private:
std::mutex mutex_;
std::unordered_map<std::thread::id, std::vector<framework::LoDTensor>>
buffer_;
};
MultipleReader(const std::vector<std::string>& file_names,
const std::vector<framework::DDim>& dims, size_t thread_num)
: file_names_(file_names), dims_(dims) {
prefetchers_.resize(thread_num); prefetchers_.resize(thread_num);
StartNewScheduler(); StartNewScheduler();
} }
void ReadNext(std::vector<framework::LoDTensor>* out) override; void ReadNext(std::vector<framework::LoDTensor>* out) override;
bool HasNext() const override;
void ReInit() override; void ReInit() override;
~MultipleReader() { EndScheduler(); } ~MultiFileReader() { EndScheduler(); }
private: private:
bool HasNext();
void StartNewScheduler(); void StartNewScheduler();
void EndScheduler(); void EndScheduler();
void ScheduleThreadFunc(); void ScheduleThreadFunc();
...@@ -60,39 +47,36 @@ class MultipleReader : public framework::ReaderBase { ...@@ -60,39 +47,36 @@ class MultipleReader : public framework::ReaderBase {
std::vector<framework::DDim> dims_; std::vector<framework::DDim> dims_;
std::thread scheduler_; std::thread scheduler_;
std::vector<std::thread> prefetchers_; std::vector<std::thread> prefetchers_;
size_t buffer_size_;
framework::Channel<size_t>* waiting_file_idx_; framework::Channel<size_t>* waiting_file_idx_;
framework::Channel<size_t>* available_thread_idx_; framework::Channel<size_t>* available_thread_idx_;
framework::Channel<std::vector<framework::LoDTensor>>* buffer_; framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
mutable ThreadBufferMap thread_buffer_map_;
}; };
void MultipleReader::ReadNext(std::vector<framework::LoDTensor>* out) { void MultiFileReader::ReadNext(std::vector<framework::LoDTensor>* out) {
if (!HasNext()) { out->clear();
PADDLE_THROW("There is no next data!"); if (HasNext()) {
buffer_->Receive(out);
} }
auto& thread_local_buffer = thread_buffer_map_[std::this_thread::get_id()];
*out = thread_local_buffer;
thread_local_buffer.clear();
}
bool MultipleReader::HasNext() const {
auto& thread_local_buffer = thread_buffer_map_[std::this_thread::get_id()];
return thread_local_buffer.empty() ? buffer_->Receive(&thread_local_buffer)
: true;
} }
void MultipleReader::ReInit() { void MultiFileReader::ReInit() {
EndScheduler(); EndScheduler();
thread_buffer_map_.Clear();
StartNewScheduler(); StartNewScheduler();
} }
void MultipleReader::StartNewScheduler() { bool MultiFileReader::HasNext() {
while (!buffer_->IsClosed() && !buffer_->CanReceive()) {
}
return buffer_->CanReceive();
}
void MultiFileReader::StartNewScheduler() {
size_t thread_num = prefetchers_.size(); size_t thread_num = prefetchers_.size();
waiting_file_idx_ = framework::MakeChannel<size_t>(file_names_.size()); waiting_file_idx_ = framework::MakeChannel<size_t>(file_names_.size());
available_thread_idx_ = framework::MakeChannel<size_t>(thread_num); available_thread_idx_ = framework::MakeChannel<size_t>(thread_num);
buffer_ = buffer_ =
framework::MakeChannel<std::vector<framework::LoDTensor>>(thread_num); framework::MakeChannel<std::vector<framework::LoDTensor>>(buffer_size_);
for (size_t i = 0; i < file_names_.size(); ++i) { for (size_t i = 0; i < file_names_.size(); ++i) {
waiting_file_idx_->Send(&i); waiting_file_idx_->Send(&i);
...@@ -105,7 +89,7 @@ void MultipleReader::StartNewScheduler() { ...@@ -105,7 +89,7 @@ void MultipleReader::StartNewScheduler() {
scheduler_ = std::thread([this] { ScheduleThreadFunc(); }); scheduler_ = std::thread([this] { ScheduleThreadFunc(); });
} }
void MultipleReader::EndScheduler() { void MultiFileReader::EndScheduler() {
available_thread_idx_->Close(); available_thread_idx_->Close();
buffer_->Close(); buffer_->Close();
waiting_file_idx_->Close(); waiting_file_idx_->Close();
...@@ -117,8 +101,8 @@ void MultipleReader::EndScheduler() { ...@@ -117,8 +101,8 @@ void MultipleReader::EndScheduler() {
delete waiting_file_idx_; delete waiting_file_idx_;
} }
void MultipleReader::ScheduleThreadFunc() { void MultiFileReader::ScheduleThreadFunc() {
VLOG(5) << "MultipleReader schedule thread starts."; VLOG(5) << "MultiFileReader schedule thread starts.";
size_t completed_thread_num = 0; size_t completed_thread_num = 0;
size_t thread_idx; size_t thread_idx;
while (available_thread_idx_->Receive(&thread_idx)) { while (available_thread_idx_->Receive(&thread_idx)) {
...@@ -150,17 +134,20 @@ void MultipleReader::ScheduleThreadFunc() { ...@@ -150,17 +134,20 @@ void MultipleReader::ScheduleThreadFunc() {
p.join(); p.join();
} }
} }
VLOG(5) << "MultipleReader schedule thread terminates."; VLOG(5) << "MultiFileReader schedule thread terminates.";
} }
void MultipleReader::PrefetchThreadFunc(std::string file_name, void MultiFileReader::PrefetchThreadFunc(std::string file_name,
size_t thread_idx) { size_t thread_idx) {
VLOG(5) << "The prefetch thread of file '" << file_name << "' starts."; VLOG(5) << "The prefetch thread of file '" << file_name << "' starts.";
std::unique_ptr<framework::ReaderBase> reader = std::unique_ptr<framework::ReaderBase> reader =
CreateReaderByFileName(file_name, dims_); CreateReaderByFileName(file_name, dims_);
while (reader->HasNext()) { while (true) {
std::vector<framework::LoDTensor> ins; std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins); reader->ReadNext(&ins);
if (ins.empty()) {
break;
}
try { try {
buffer_->Send(&ins); buffer_->Send(&ins);
} catch (paddle::platform::EnforceNotMet e) { } catch (paddle::platform::EnforceNotMet e) {
...@@ -197,11 +184,13 @@ class OpenFilesOp : public framework::OperatorBase { ...@@ -197,11 +184,13 @@ class OpenFilesOp : public framework::OperatorBase {
const auto& file_names = Attr<std::vector<std::string>>("file_names"); const auto& file_names = Attr<std::vector<std::string>>("file_names");
PADDLE_ENFORCE(!file_names.empty(), "No file to be read!"); PADDLE_ENFORCE(!file_names.empty(), "No file to be read!");
const size_t thread_num = Attr<int>("thread_num"); const size_t thread_num = Attr<int>("thread_num");
const size_t buffer_size = Attr<int>("buffer_size");
auto* out = scope.FindVar(Output("Out")) auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>(); ->template GetMutable<framework::ReaderHolder>();
out->Reset(new MultipleReader( out->Reset(new MultiFileReader(file_names,
file_names, RestoreShapes(shape_concat, ranks), thread_num)); RestoreShapes(shape_concat, ranks),
thread_num, buffer_size));
} }
}; };
...@@ -212,11 +201,12 @@ class OpenFilesOpMaker : public FileReaderMakerBase { ...@@ -212,11 +201,12 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
AddAttr<std::vector<std::string>>("file_names", "Files to be read."); AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.") AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.")
.GreaterThan(0); .GreaterThan(0);
AddAttr<int>("buffer_size", "The size of prefetch buffer.").GreaterThan(0);
AddComment(R"DOC( AddComment(R"DOC(
OpenFiles Operator OpenFiles Operator
An OpenFilesOp creates a MultipleReader, which is able to An OpenFilesOp creates a MultiFileReader, which is able to
read data from multiple files using multiple threads. read data from multiple files using multiple threads.
)DOC"); )DOC");
} }
......
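A minimal standalone sketch of the scheme described above: several prefetch threads (one per file) push batches into a bounded buffer, and the consumer pops until the buffer is closed and drained. BoundedBuffer, kBufferSize, and kNumFiles are illustrative names only; the real reader uses framework::Channel together with the new buffer_size attribute.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class BoundedBuffer {
 public:
  explicit BoundedBuffer(size_t capacity) : capacity_(capacity) {}

  void Push(int item) {
    std::unique_lock<std::mutex> lock(mutex_);
    not_full_.wait(lock, [this] { return queue_.size() < capacity_; });
    queue_.push(item);
    not_empty_.notify_one();
  }

  // Returns false once the buffer is closed and drained (the "no next data" case).
  bool Pop(int* item) {
    std::unique_lock<std::mutex> lock(mutex_);
    not_empty_.wait(lock, [this] { return !queue_.empty() || closed_; });
    if (queue_.empty()) return false;
    *item = queue_.front();
    queue_.pop();
    not_full_.notify_one();
    return true;
  }

  void Close() {
    std::lock_guard<std::mutex> lock(mutex_);
    closed_ = true;
    not_empty_.notify_all();
  }

 private:
  const size_t capacity_;
  bool closed_ = false;
  std::queue<int> queue_;
  std::mutex mutex_;
  std::condition_variable not_empty_, not_full_;
};

int main() {
  const size_t kBufferSize = 4;  // plays the role of the new "buffer_size" attribute
  const int kNumFiles = 3;       // one prefetch thread per "file"
  BoundedBuffer buffer(kBufferSize);

  std::vector<std::thread> prefetchers;
  for (int f = 0; f < kNumFiles; ++f) {
    prefetchers.emplace_back([f, &buffer] {
      for (int i = 0; i < 5; ++i) buffer.Push(f * 100 + i);  // "read" one file
    });
  }
  std::thread closer([&] {
    for (auto& p : prefetchers) p.join();
    buffer.Close();  // all files exhausted
  });

  int item;
  while (buffer.Pop(&item)) std::cout << "got " << item << "\n";
  closer.join();
  return 0;
}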
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/spp_op.h" #include "paddle/fluid/operators/spp_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/math/pooling.h"
......
...@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/operators/sum_op.h"
#include <algorithm>
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <iostream> #include <iostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/transpose_op.h"
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/unpool_op.h" #include "paddle/fluid/operators/unpool_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/unpooling.h" #include "paddle/fluid/operators/math/unpooling.h"
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_padding.h" #include "paddle/fluid/operators/math/sequence_padding.h"
......
...@@ -42,12 +42,12 @@ ENDIF() ...@@ -42,12 +42,12 @@ ENDIF()
# memcpy depends on device_context, here add deps individually for # memcpy depends on device_context, here add deps individually for
# avoiding cycle dependencies # avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator cc_library(device_context SRCS device_context.cc DEPS malloc
system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mutex>
namespace paddle {
namespace platform {
/*
The current implementation of std::call_once has a bug described in
https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call.
This is likely caused by a deeper bug in pthread_once, which is discussed in
https://patchwork.ozlabs.org/patch/482350/
This wrapper is a hack to avoid this bug.
*/
template <typename Callable, typename... Args>
inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
bool good = true;
std::exception ex;
try {
std::call_once(flag,
[&](Args&&... args) {
try {
f(args...);
} catch (const std::exception& e) {
ex = e;
good = false;
} catch (...) {
ex = std::runtime_error("excption caught in call_once");
good = false;
}
},
args...);
} catch (std::system_error& x) {
throw std::runtime_error("call once failed");
}
if (!good) {
throw std::exception(ex);
}
}
} // namespace platform
} // namespace paddle
...@@ -33,22 +33,26 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512; ...@@ -33,22 +33,26 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512;
USE_CUDA_ATOMIC(Add, float); USE_CUDA_ATOMIC(Add, float);
USE_CUDA_ATOMIC(Add, int); USE_CUDA_ATOMIC(Add, int);
USE_CUDA_ATOMIC(Add, unsigned int); USE_CUDA_ATOMIC(Add, unsigned int);
USE_CUDA_ATOMIC(Add, unsigned long long int); // CUDA API uses unsigned long long int, we cannot use uint64_t here.
// This is because unsigned long long int is not necessarily uint64_t.
USE_CUDA_ATOMIC(Add, unsigned long long int); // NOLINT
CUDA_ATOMIC_WRAPPER(Add, int64_t) { CUDA_ATOMIC_WRAPPER(Add, int64_t) {
static_assert(sizeof(int64_t) == sizeof(long long int), // Here, we check that long long int has the same size as int64_t.
static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT
"long long should be int64"); "long long should be int64");
return CudaAtomicAdd(reinterpret_cast<unsigned long long int*>(address), return CudaAtomicAdd(
static_cast<unsigned long long int>(val)); reinterpret_cast<unsigned long long int*>(address), // NOLINT
static_cast<unsigned long long int>(val)); // NOLINT
} }
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
USE_CUDA_ATOMIC(Add, double); USE_CUDA_ATOMIC(Add, double);
#else #else
CUDA_ATOMIC_WRAPPER(Add, double) { CUDA_ATOMIC_WRAPPER(Add, double) {
unsigned long long int* address_as_ull = unsigned long long int* address_as_ull = // NOLINT
reinterpret_cast<unsigned long long int*>(address); reinterpret_cast<unsigned long long int*>(address); // NOLINT
unsigned long long int old = *address_as_ull, assumed; unsigned long long int old = *address_as_ull, assumed; // NOLINT
do { do {
assumed = old; assumed = old;
...@@ -62,53 +66,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) { ...@@ -62,53 +66,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
} }
#endif #endif
// __shfl_down has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// TODO(zcd): The warp size should be taken from the
// parameters of the GPU rather than simply hard-coded as 32.
// To make reduceSum more efficient,
// I use warp-level parallelism and assume the warp size
// is 32, which may differ across GPUs,
// but most cards' warp size is 32.
__shared__ T shm[32];
const int warpSize = 32;
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset);
if (tid < warpSize) shm[tid] = 0;
__syncthreads();
if (tid % warpSize == 0) {
shm[tid / warpSize] = val;
}
CREATE_SHFL_MASK(mask, tid < warpSize);
if (tid < warpSize) {
val = shm[tid];
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset);
}
return val;
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -8,10 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -8,10 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include <string>
#include <unordered_set> #include <unordered_set>
#include <vector>
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -8,11 +8,12 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -8,11 +8,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <memory> #include <memory>
#include <string>
#include <unordered_map> #include <unordered_map>
#include <vector>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublas.h"
......
...@@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h"
TEST(Device, Init) { TEST(Device, Init) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
......
...@@ -11,15 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,15 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/device_tracer.h"
#include <google/protobuf/text_format.h>
#include <deque>
#include <fstream> #include <fstream>
#include <map> #include <map>
#include <mutex> #include <mutex> // NOLINT
#include <numeric> #include <numeric>
#include <thread> #include <string>
#include <thread> // NOLINT
#include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -123,7 +127,7 @@ void DisableActivity() { ...@@ -123,7 +127,7 @@ void DisableActivity() {
void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
size_t *maxNumRecords) { size_t *maxNumRecords) {
uint8_t *buf = (uint8_t *)malloc(kBufSize + kAlignSize); uint8_t *buf = reinterpret_cast<uint8_t *>(malloc(kBufSize + kAlignSize));
*size = kBufSize; *size = kBufSize;
*buffer = ALIGN_BUFFER(buf, kAlignSize); *buffer = ALIGN_BUFFER(buf, kAlignSize);
*maxNumRecords = 0; *maxNumRecords = 0;
......
...@@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
......
...@@ -28,6 +28,10 @@ CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); ...@@ -28,6 +28,10 @@ CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif #endif
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R3
CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -71,7 +71,6 @@ extern void *cublas_dso_handle; ...@@ -71,7 +71,6 @@ extern void *cublas_dso_handle;
__macro(cublasDgemm_v2); \ __macro(cublasDgemm_v2); \
__macro(cublasHgemm); \ __macro(cublasHgemm); \
__macro(cublasSgemmEx); \ __macro(cublasSgemmEx); \
__macro(cublasGemmEx); \
__macro(cublasSgeam_v2); \ __macro(cublasSgeam_v2); \
__macro(cublasDgeam_v2); \ __macro(cublasDgeam_v2); \
__macro(cublasCreate_v2); \ __macro(cublasCreate_v2); \
...@@ -83,11 +82,6 @@ extern void *cublas_dso_handle; ...@@ -83,11 +82,6 @@ extern void *cublas_dso_handle;
__macro(cublasDgemmBatched); \ __macro(cublasDgemmBatched); \
__macro(cublasCgemmBatched); \ __macro(cublasCgemmBatched); \
__macro(cublasZgemmBatched); \ __macro(cublasZgemmBatched); \
__macro(cublasSgemmStridedBatched); \
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched); \
__macro(cublasSgetrfBatched); \ __macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \ __macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \ __macro(cublasDgetrfBatched); \
...@@ -95,10 +89,24 @@ extern void *cublas_dso_handle; ...@@ -95,10 +89,24 @@ extern void *cublas_dso_handle;
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
__macro(cublasGemmEx); \
__macro(cublasSgemmStridedBatched); \
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.0 // APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) __macro(cublasSetMathMode); #define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
......
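A rough standalone sketch of the idea behind these dynamic-load wrappers: each symbol is resolved lazily from a shared library on first use, so entry points that exist only in newer CUDA releases can be grouped behind version guards (the R2/R3 macros above) without breaking builds against older toolkits. The library and symbol names here (libm.so.6, cosf) are illustrative only; compile with -ldl on Linux.

#include <dlfcn.h>
#include <iostream>
#include <mutex>

// Resolve a symbol from a shared library the first time it is needed.
// Note: the statics are per template instantiation, which is fine for a sketch.
template <typename Func>
Func LoadSymbol(const char* lib, const char* name) {
  static std::once_flag flag;
  static void* handle = nullptr;
  std::call_once(flag, [&] { handle = dlopen(lib, RTLD_LAZY); });
  return handle ? reinterpret_cast<Func>(dlsym(handle, name)) : nullptr;
}

int main() {
  using CosF = float (*)(float);
  CosF cos_f = LoadSymbol<CosF>("libm.so.6", "cosf");
  if (cos_f != nullptr) {
    std::cout << "cosf(0) = " << cos_f(0.0f) << "\n";
  } else {
    std::cout << "symbol not available in this environment\n";
  }
  return 0;
}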
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/call_once.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
......
...@@ -1003,6 +1003,46 @@ HOSTDEVICE inline float16 exp(const float16& a) { ...@@ -1003,6 +1003,46 @@ HOSTDEVICE inline float16 exp(const float16& a) {
return float16(::expf(static_cast<float>(a))); return float16(::expf(static_cast<float>(a)));
} }
template <>
HOSTDEVICE inline float16 log(const float16& a) {
return float16(::logf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 tanh(const float16& a) {
return float16(::tanhf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 sqrt(const float16& a) {
return float16(::sqrtf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 ceil(const float16& a) {
return float16(::ceilf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 floor(const float16& a) {
return float16(::floorf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 round(const float16& a) {
return float16(::roundf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 pow(const float16& a, const float16& b) {
return float16(::powf(static_cast<float>(a), static_cast<float>(b)));
}
template <>
HOSTDEVICE inline float16 abs(const float16& a) {
return float16(::fabs(static_cast<float>(a)));
}
} // namespace numext } // namespace numext
} // namespace Eigen } // namespace Eigen
...@@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include <gtest/gtest.h>
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) { ...@@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) {
// Conversion operator // Conversion operator
EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
EXPECT_EQ(float(float16(0.5f)), 0.5f); EXPECT_EQ(static_cast<float>(float16(0.5f)), 0.5f);
EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001); EXPECT_NEAR(static_cast<double>(float16(0.33333)), 0.33333, 0.0001);
EXPECT_EQ(int(float16(-1)), -1); EXPECT_EQ(static_cast<int>(float16(-1)), -1);
EXPECT_EQ(bool(float16(true)), true); EXPECT_EQ(static_cast<bool>(float16(true)), true);
} }
TEST(float16, arithmetic_cpu) { TEST(float16, arithmetic_cpu) {
EXPECT_EQ(float(float16(1) + float16(1)), 2); EXPECT_EQ(static_cast<float>(float16(1) + float16(1)), 2);
EXPECT_EQ(float(float16(5) + float16(-5)), 0); EXPECT_EQ(static_cast<float>(float16(5) + float16(-5)), 0);
EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001); EXPECT_NEAR(static_cast<float>(float16(0.33333f) + float16(0.66667f)), 1.0f,
EXPECT_EQ(float(float16(3) - float16(5)), -2); 0.001);
EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001); EXPECT_EQ(static_cast<float>(float16(3) - float16(5)), -2);
EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); EXPECT_NEAR(static_cast<float>(float16(0.66667f) - float16(0.33333f)),
EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); 0.33334f, 0.001);
EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001); EXPECT_NEAR(static_cast<float>(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f); EXPECT_NEAR(static_cast<float>(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
EXPECT_EQ(float(-float16(512.0f)), -512.0f); EXPECT_NEAR(static_cast<float>(float16(2.0f) / float16(3.0f)), 0.66667f,
EXPECT_EQ(float(-float16(-512.0f)), 512.0f); 0.001);
EXPECT_EQ(static_cast<float>(float16(1.0f) / float16(2.0f)), 0.5f);
EXPECT_EQ(static_cast<float>(-float16(512.0f)), -512.0f);
EXPECT_EQ(static_cast<float>(-float16(-512.0f)), 512.0f);
} }
TEST(float16, comparison_cpu) { TEST(float16, comparison_cpu) {
......
...@@ -36,19 +36,19 @@ limitations under the License. */ ...@@ -36,19 +36,19 @@ limitations under the License. */
half *in1, *in2, *out; \ half *in1, *in2, *out; \
half *d_in1, *d_in2, *d_out; \ half *d_in1, *d_in2, *d_out; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
cudaMalloc((void**)&d_out, size); \ cudaMalloc(reinterpret_cast<void**>(&d_out), size); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
out = (half*)malloc(size); \ out = reinterpret_cast<half*>(malloc(size)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \
EXPECT_EQ(float(float16(out[0])), v_out); \ EXPECT_EQ(static_cast<float>(float16(out[0])), v_out); \
free(in1); \ free(in1); \
free(in2); \ free(in2); \
free(out); \ free(out); \
...@@ -63,17 +63,17 @@ limitations under the License. */ ...@@ -63,17 +63,17 @@ limitations under the License. */
half *in1, *in2; \ half *in1, *in2; \
half *d_in1, *d_in2; \ half *d_in1, *d_in2; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2); \ op_type<<<1, 1>>>(d_in1, d_in2); \
cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \
EXPECT_EQ(float(float16(in1[0])), v_out); \ EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out); \
free(in1); \ free(in1); \
free(in2); \ free(in2); \
cudaFree(d_in1); \ cudaFree(d_in1); \
...@@ -87,12 +87,12 @@ limitations under the License. */ ...@@ -87,12 +87,12 @@ limitations under the License. */
half *d_in1, *d_in2; \ half *d_in1, *d_in2; \
bool *out, *d_out; \ bool *out, *d_out; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
cudaMalloc((void**)&d_out, 1); \ cudaMalloc(reinterpret_cast<void**>(&d_out), 1); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
out = (bool*)malloc(1); \ out = reinterpret_cast<bool*>(malloc(1)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
...@@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) { ...@@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) {
LOG(INFO) << "Test Neg on GPU!"; LOG(INFO) << "Test Neg on GPU!";
half *in, *d_in; half *in, *d_in;
int size = sizeof(half); int size = sizeof(half);
cudaMalloc((void**)&d_in, size); cudaMalloc(reinterpret_cast<void**>(&d_in), size);
in = (half*)malloc(size); in = reinterpret_cast<half*>(malloc(size));
in[0] = half(float16(v_in)); in[0] = half(float16(v_in));
cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
Neg<<<1, 1>>>(d_in); Neg<<<1, 1>>>(d_in);
cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
EXPECT_EQ(float(float16(in[0])), v_out); EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
free(in); free(in);
cudaFree(d_in); cudaFree(d_in);
} }
......
...@@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <mkldnn.hpp> #include <vector>
#include "mkldnn/include/mkldnn.hpp"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
namespace paddle { namespace paddle {
......
...@@ -14,8 +14,9 @@ ...@@ -14,8 +14,9 @@
#pragma once #pragma once
#include <thread> #include <thread> // NOLINT
#include <typeindex> #include <typeindex>
#include <vector>
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -29,6 +30,8 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) { ...@@ -29,6 +30,8 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
return ncclDouble; return ncclDouble;
} else if (type == typeid(int)) { // NOLINT } else if (type == typeid(int)) { // NOLINT
return ncclInt; return ncclInt;
} else if (type == typeid(int64_t)) { // NOLINT
return ncclInt64;
} else { } else {
PADDLE_THROW("Not supported"); PADDLE_THROW("Not supported");
} }
...@@ -58,7 +61,7 @@ struct NCCLContext { ...@@ -58,7 +61,7 @@ struct NCCLContext {
ncclComm_t comm_; ncclComm_t comm_;
explicit NCCLContext(int dev_id) explicit NCCLContext(int dev_id)
: ctx_(new CUDADeviceContext(CUDAPlace(dev_id))) {} : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
cudaStream_t stream() const { return ctx_->stream(); } cudaStream_t stream() const { return ctx_->stream(); }
...@@ -66,23 +69,23 @@ struct NCCLContext { ...@@ -66,23 +69,23 @@ struct NCCLContext {
return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device; return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
} }
static void InitNCCLContext(std::unordered_map<int, NCCLContext> &contexts, static void InitNCCLContext(std::unordered_map<int, NCCLContext> *contexts,
const std::vector<platform::Place> &places) { const std::vector<platform::Place> &places) {
std::vector<ncclComm_t> comms; std::vector<ncclComm_t> comms;
std::vector<int> devs; std::vector<int> devs;
comms.resize(contexts.size()); comms.resize(contexts->size());
devs.reserve(contexts.size()); devs.reserve(contexts->size());
for (auto &p : places) { for (auto &p : places) {
devs.push_back(boost::get<platform::CUDAPlace>(p).device); devs.push_back(boost::get<platform::CUDAPlace>(p).device);
} }
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
&comms[0], static_cast<int>(contexts.size()), &devs[0])); &comms[0], static_cast<int>(contexts->size()), &devs[0]));
int i = 0; int i = 0;
for (auto &dev_id : devs) { for (auto &dev_id : devs) {
contexts.at(dev_id).comm_ = comms[i++]; contexts->at(dev_id).comm_ = comms[i++];
} }
} }
}; };
...@@ -91,7 +94,8 @@ struct NCCLContextMap { ...@@ -91,7 +94,8 @@ struct NCCLContextMap {
std::unordered_map<int, NCCLContext> contexts_; std::unordered_map<int, NCCLContext> contexts_;
std::vector<int> order_; std::vector<int> order_;
NCCLContextMap(const std::vector<platform::Place> &places) { explicit NCCLContextMap(const std::vector<platform::Place> &places) {
PADDLE_ENFORCE(!places.empty());
order_.reserve(places.size()); order_.reserve(places.size());
for (auto &p : places) { for (auto &p : places) {
int dev_id = boost::get<CUDAPlace>(p).device; int dev_id = boost::get<CUDAPlace>(p).device;
...@@ -102,15 +106,17 @@ struct NCCLContextMap { ...@@ -102,15 +106,17 @@ struct NCCLContextMap {
order_.size(), contexts_.size(), order_.size(), contexts_.size(),
"NCCL Context Map does not support contain two or more same device"); "NCCL Context Map does not support contain two or more same device");
std::vector<ncclComm_t> comms; if (places.size() > 1) {
comms.resize(order_.size()); std::vector<ncclComm_t> comms;
comms.resize(order_.size());
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
&comms[0], static_cast<int>(order_.size()), &order_[0])); &comms[0], static_cast<int>(order_.size()), &order_[0]));
int i = 0; int i = 0;
for (auto &dev_id : order_) { for (auto &dev_id : order_) {
contexts_.at(dev_id).comm_ = comms[i++]; contexts_.at(dev_id).comm_ = comms[i++];
}
} }
} }
......
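A small Python sketch of the guarded initialization added to NCCLContextMap above (illustrative only; build_context_map and init_comms are made-up names, the latter standing in for ncclCommInitAll): communicators are created only when more than one device is involved, so single-GPU runs skip NCCL setup entirely.

    def build_context_map(dev_ids, init_comms=lambda devs: list(range(len(devs)))):
        # mirrors NCCLContextMap: one context per device, duplicates rejected
        assert dev_ids, "at least one place is required"
        assert len(set(dev_ids)) == len(dev_ids), "two contexts on one device are not supported"
        contexts = {d: {"comm": None} for d in dev_ids}
        if len(dev_ids) > 1:
            for d, comm in zip(dev_ids, init_comms(dev_ids)):
                contexts[d]["comm"] = comm
        return contexts

    print(build_context_map([0]))        # single device: no communicator needed
    print(build_context_map([0, 1, 2]))  # one communicator handle per device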
...@@ -15,8 +15,11 @@ limitations under the License. */ ...@@ -15,8 +15,11 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <sys/time.h> #include <sys/time.h>
#include <time.h> #include <time.h>
#include <algorithm>
#include <iomanip> #include <iomanip>
#include <map> #include <map>
#include <mutex> // NOLINT
#include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -28,10 +31,10 @@ limitations under the License. */ ...@@ -28,10 +31,10 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
struct EventList;
// The profiler state, the initial value is ProfilerState::kDisabled // The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled; static ProfilerState g_state = ProfilerState::kDisabled;
// To record which timer the profiler used, CUDA or CPU.
static std::string g_profiler_place = "";
// The thread local event list only can be accessed by the specific thread // The thread local event list only can be accessed by the specific thread
// The thread index of each thread // The thread index of each thread
static thread_local int32_t g_thread_id; static thread_local int32_t g_thread_id;
...@@ -45,6 +48,39 @@ static std::list<std::shared_ptr<EventList>> g_all_event_lists; ...@@ -45,6 +48,39 @@ static std::list<std::shared_ptr<EventList>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread // The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list; static thread_local std::shared_ptr<EventList> g_event_list;
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(Event);
constexpr static size_t kEventAlign = alignof(Event);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
void Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
}
std::vector<Event> Reduce() {
std::vector<Event> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<Event>> event_blocks;
};
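A condensed Python model of the EventList bookkeeping moved into profiler.cc here (illustrative only; NUM_BLOCK is a toy capacity, whereas the C++ code derives kNumBlock from a 16 MB block size): Record appends into the newest fixed-capacity block, and Reduce flattens the blocks back into chronological order.

    from collections import deque

    class EventList(object):
        NUM_BLOCK = 4  # toy capacity

        def __init__(self):
            self.event_blocks = deque()  # newest block at the front, like std::forward_list

        def record(self, event):
            if not self.event_blocks or len(self.event_blocks[0]) == self.NUM_BLOCK:
                self.event_blocks.appendleft([])
            self.event_blocks[0].append(event)

        def reduce(self):
            result = []
            for block in self.event_blocks:    # newest block first
                result = list(block) + result  # so older blocks end up at the beginning
            self.event_blocks.clear()
            return result

    el = EventList()
    for i in range(10):
        el.record(i)
    print(el.reduce())   # [0, 1, ..., 9] in recording order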
inline uint64_t GetTimeInNsec() { inline uint64_t GetTimeInNsec() {
using clock = std::conditional<std::chrono::high_resolution_clock::is_steady, using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock, std::chrono::high_resolution_clock,
...@@ -60,9 +96,9 @@ inline uint64_t PosixInNsec() { ...@@ -60,9 +96,9 @@ inline uint64_t PosixInNsec() {
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec); return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
} }
Event::Event(EventKind kind, std::string name, uint32_t thread_id, Event::Event(EventType type, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx) const DeviceContext* dev_ctx)
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false; has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
if (has_cuda_) { if (has_cuda_) {
...@@ -76,17 +112,7 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id, ...@@ -76,17 +112,7 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id,
cpu_ns_ = GetTimeInNsec(); cpu_ns_ = GetTimeInNsec();
} }
std::string Event::kind() const { const EventType& Event::type() const { return type_; }
switch (kind_) {
case EventKind::kMark:
return "mark";
case EventKind::kPushRange:
return "push";
case EventKind::kPopRange:
return "pop";
}
PADDLE_THROW("Unknown EventKind.");
}
double Event::CpuElapsedMs(const Event& e) const { double Event::CpuElapsedMs(const Event& e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000000.0); return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
...@@ -129,15 +155,15 @@ inline EventList& GetEventList() { ...@@ -129,15 +155,15 @@ inline EventList& GetEventList() {
} }
void Mark(const std::string& name, const DeviceContext* dev_ctx) { void Mark(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx);
} }
void PushEvent(const std::string& name, const DeviceContext* dev_ctx) { void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx);
} }
void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx);
} }
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
...@@ -197,12 +223,7 @@ void EnableProfiler(ProfilerState state) { ...@@ -197,12 +223,7 @@ void EnableProfiler(ProfilerState state) {
"The profiling state should be disabled when calling ", "The profiling state should be disabled when calling ",
"EnableProfiler."); "EnableProfiler.");
g_state = state; g_state = state;
if (g_state == ProfilerState::kCUDA) { if (g_state == ProfilerState::kAll) {
g_profiler_place = "CUDA";
} else if (g_state == ProfilerState::kCPU) {
g_profiler_place = "CPU";
} else {
g_profiler_place = "All";
GetDeviceTracer()->Enable(); GetDeviceTracer()->Enable();
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -240,27 +261,63 @@ std::vector<std::vector<Event>> GetAllEvents() { ...@@ -240,27 +261,63 @@ std::vector<std::vector<Event>> GetAllEvents() {
return result; return result;
} }
void DisableProfiler(EventSortingKey sorted_key, // The information of each event given in the profiling report
const std::string& profile_path) { struct EventItem {
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, std::string name;
"Can't disable profiling, since it's not starting."); int calls;
// Mark the profiling stop. double total_time;
Mark("_stop_profiler_", nullptr); double min_time;
g_state = ProfilerState::kDisabled; double max_time;
double ave_time;
};
// Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
const std::string& sorted_domain, const size_t name_width,
const size_t data_width) {
// Output header information
std::cout << "\n------------------------->"
<< " Profiling Report "
<< "<-------------------------\n\n";
std::string place;
if (g_state == ProfilerState::kCPU) {
place = "CPU";
} else if (g_state == ProfilerState::kCUDA) {
place = "CUDA";
} else if (g_state == ProfilerState::kAll) {
place = "All";
} else {
PADDLE_THROW("Invalid profiler state");
}
std::vector<std::vector<Event>> all_events = GetAllEvents(); std::cout << "Place: " << place << std::endl;
ParseEvents(all_events, sorted_key); std::cout << "Time unit: ms" << std::endl;
ResetProfiler(); std::cout << "Sorted by " << sorted_domain
DeviceTracer* tracer = GetDeviceTracer(); << " in descending order in the same thread\n\n";
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) { // Output events table
tracer->Disable(); std::cout.setf(std::ios::left);
tracer->GenProfile(profile_path); std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
const EventItem& event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
}
} }
std::cout << std::endl;
} }
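A minimal Python sketch of the fixed-width layout the relocated PrintProfiler produces (illustrative only; the row values are invented): every column is left-aligned with a fixed width, matching the std::ios::left / std::setw usage above.

    name_width, data_width = 24, 12
    header = ("Event", "Calls", "Total", "Min.", "Max.", "Ave.")
    rows = [("thread0::fc_op", 100, 12.5, 0.05, 1.2, 0.125),
            ("thread0::relu_op", 100, 3.1, 0.01, 0.4, 0.031)]
    print(header[0].ljust(name_width) + "".join(h.ljust(data_width) for h in header[1:]))
    for name, calls, total, mn, mx, ave in rows:
        cells = (calls, total, mn, mx, ave)
        print(name.ljust(name_width) + "".join(str(c).ljust(data_width) for c in cells))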
void ParseEvents(std::vector<std::vector<Event>>& events, // Parse the event list and output the profiling report
EventSortingKey sorted_by) { void ParseEvents(const std::vector<std::vector<Event>>& events,
if (g_profiler_place == "") return; EventSortingKey sorted_by = EventSortingKey::kDefault) {
if (g_state == ProfilerState::kDisabled) return;
std::string sorted_domain; std::string sorted_domain;
std::function<bool(const EventItem&, const EventItem&)> sorted_func; std::function<bool(const EventItem&, const EventItem&)> sorted_func;
...@@ -307,9 +364,9 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -307,9 +364,9 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
std::unordered_map<std::string, int> event_idx; std::unordered_map<std::string, int> event_idx;
for (size_t j = 0; j < events[i].size(); j++) { for (size_t j = 0; j < events[i].size(); j++) {
if (events[i][j].kind() == "push") { if (events[i][j].type() == EventType::kPushRange) {
pushed_events.push_back(events[i][j]); pushed_events.push_back(events[i][j]);
} else if (events[i][j].kind() == "pop") { } else if (events[i][j].type() == EventType::kPopRange) {
std::list<Event>::reverse_iterator rit = pushed_events.rbegin(); std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
while (rit != pushed_events.rend() && while (rit != pushed_events.rend() &&
rit->name() != events[i][j].name()) { rit->name() != events[i][j].name()) {
...@@ -317,10 +374,10 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -317,10 +374,10 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
} }
if (rit != pushed_events.rend()) { if (rit != pushed_events.rend()) {
double event_time = double event_time = (g_state == ProfilerState::kCUDA ||
(g_profiler_place == "CUDA" || g_profiler_place == "All") g_state == ProfilerState::kAll)
? rit->CudaElapsedMs(events[i][j]) ? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]); : rit->CpuElapsedMs(events[i][j]);
std::string event_name = std::string event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
...@@ -376,35 +433,22 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -376,35 +433,22 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12); PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
} }
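The heart of ParseEvents is pairing kPushRange/kPopRange events within a thread and accumulating per-name statistics. A compact Python sketch of that logic (illustrative only; events are modelled as (type, name, elapsed-ms) tuples and the CPU/CUDA timer choice is ignored):

    def parse_events(thread_events):
        stats = {}   # name -> (calls, total, min, max)
        pushed = []
        for typ, name, t in thread_events:
            if typ == "push":
                pushed.append((name, t))
            elif typ == "pop":
                # search backwards for the most recent push with the same name
                for i in range(len(pushed) - 1, -1, -1):
                    if pushed[i][0] == name:
                        elapsed = t - pushed[i][1]
                        calls, total, mn, mx = stats.get(name, (0, 0.0, float("inf"), 0.0))
                        stats[name] = (calls + 1, total + elapsed,
                                       min(mn, elapsed), max(mx, elapsed))
                        del pushed[i]
                        break
        # add the average, mirroring the EventItem fields
        return {n: (c, tot, mn, mx, tot / c) for n, (c, tot, mn, mx) in stats.items()}

    print(parse_events([("push", "fc", 0.0), ("push", "relu", 1.0),
                        ("pop", "relu", 2.5), ("pop", "fc", 4.0)]))
    # {'fc': (1, 4.0, 4.0, 4.0, 4.0), 'relu': (1, 1.5, 1.5, 1.5, 1.5)}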
void PrintProfiler(std::vector<std::vector<EventItem>>& events_table, void DisableProfiler(EventSortingKey sorted_key,
std::string& sorted_domain, const size_t name_width, const std::string& profile_path) {
const size_t data_width) { PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
// Output header information "Can't disable profiling, since it's not starting.");
std::cout << "\n------------------------->" // Mark the profiling stop.
<< " Profiling Report " Mark("_stop_profiler_", nullptr);
<< "<-------------------------\n\n";
std::cout << "Place: " << g_profiler_place << std::endl; std::vector<std::vector<Event>> all_events = GetAllEvents();
std::cout << "Time unit: ms" << std::endl; ParseEvents(all_events, sorted_key);
std::cout << "Sorted by " << sorted_domain ResetProfiler();
<< " in descending order in the same thread\n\n"; DeviceTracer* tracer = GetDeviceTracer();
// Output events table if (g_state == ProfilerState::kAll && tracer && tracer->IsEnabled()) {
std::cout.setf(std::ios::left); tracer->Disable();
std::cout << std::setw(name_width) << "Event" << std::setw(data_width) tracer->GenProfile(profile_path);
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
EventItem& event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
}
} }
std::cout << std::endl; g_state = ProfilerState::kDisabled;
} }
} // namespace platform } // namespace platform
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <forward_list> #include <forward_list>
#include <list> #include <list>
#include <mutex> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
...@@ -23,16 +23,16 @@ limitations under the License. */ ...@@ -23,16 +23,16 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
enum EventKind { kMark, kPushRange, kPopRange }; enum EventType { kMark, kPushRange, kPopRange };
class Event { class Event {
public: public:
// The DeviceContext is used to get the cuda stream. // The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr. // If CPU profiling mode, can pass nullptr.
Event(EventKind kind, std::string name, uint32_t thread_id, Event(EventType type, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx); const DeviceContext* dev_ctx);
std::string kind() const; const EventType& type() const;
std::string name() const { return name_; } std::string name() const { return name_; }
uint32_t thread_id() const { return thread_id_; } uint32_t thread_id() const { return thread_id_; }
bool has_cuda() const { return has_cuda_; } bool has_cuda() const { return has_cuda_; }
...@@ -46,7 +46,7 @@ class Event { ...@@ -46,7 +46,7 @@ class Event {
double CudaElapsedMs(const Event& e) const; double CudaElapsedMs(const Event& e) const;
private: private:
EventKind kind_; EventType type_;
std::string name_; std::string name_;
uint32_t thread_id_; uint32_t thread_id_;
int64_t cpu_ns_; int64_t cpu_ns_;
...@@ -57,39 +57,6 @@ class Event { ...@@ -57,39 +57,6 @@ class Event {
#endif #endif
}; };
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(Event);
constexpr static size_t kEventAlign = alignof(Event);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
void Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
}
std::vector<Event> Reduce() {
std::vector<Event> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<Event>> event_blocks;
};
enum ProfilerState { enum ProfilerState {
kDisabled, // disabled state kDisabled, // disabled state
kCPU, // CPU profiling state kCPU, // CPU profiling state
...@@ -136,16 +103,6 @@ struct RecordThread { ...@@ -136,16 +103,6 @@ struct RecordThread {
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> GetAllEvents(); std::vector<std::vector<Event>> GetAllEvents();
// The information of each event given in the profiling report
struct EventItem {
std::string name;
int calls;
double total_time;
double min_time;
double max_time;
double ave_time;
};
// Candidate keys to sort the profiling report // Candidate keys to sort the profiling report
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
...@@ -158,14 +115,5 @@ void ResetProfiler(); ...@@ -158,14 +115,5 @@ void ResetProfiler();
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path); const std::string& profile_path);
// Parse the event list and output the profiling report
void ParseEvents(std::vector<std::vector<Event>>&,
EventSortingKey sorted_by = EventSortingKey::kDefault);
// Print results
void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width,
const size_t data_width);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
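The RecordEvent helper used throughout these files is an RAII guard: its constructor pushes an event and its destructor pops it, so nested scopes pair up automatically. A Python analogy using a context manager (illustrative only, not the fluid profiler API):

    import contextlib
    import time

    events = []   # stand-in for the thread-local EventList

    @contextlib.contextmanager
    def record_event(name):
        events.append(("push", name, time.time()))
        try:
            yield
        finally:
            events.append(("pop", name, time.time()))

    with record_event("forward"):
        with record_event("fc"):
            sum(range(100000))
    print([(typ, name) for typ, name, _ in events])
    # [('push', 'forward'), ('push', 'fc'), ('pop', 'fc'), ('pop', 'forward')]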
...@@ -13,22 +13,23 @@ See the License for the specific language governing permissions and ...@@ -13,22 +13,23 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "cuda_runtime.h" #include <cuda_runtime.h>
#endif #endif
#include "gtest/gtest.h" #include "gtest/gtest.h"
TEST(Event, CpuElapsedTime) { TEST(Event, CpuElapsedTime) {
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventType;
Event start_event(EventKind::kPushRange, "test", 0, nullptr); Event start_event(EventType::kPushRange, "test", 0, nullptr);
EXPECT_TRUE(start_event.has_cuda() == false); EXPECT_TRUE(start_event.has_cuda() == false);
int counter = 0; int counter = 0;
while (counter != 1000) { while (counter != 1000) {
counter++; counter++;
} }
Event stop_event(EventKind::kPopRange, "test", 0, nullptr); Event stop_event(EventType::kPopRange, "test", 0, nullptr);
EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
} }
...@@ -38,16 +39,16 @@ TEST(Event, CudaElapsedTime) { ...@@ -38,16 +39,16 @@ TEST(Event, CudaElapsedTime) {
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::CUDAPlace; using paddle::platform::CUDAPlace;
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventType;
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0)); DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); Event start_event(EventType::kPushRange, "test", 0, dev_ctx);
EXPECT_TRUE(start_event.has_cuda() == true); EXPECT_TRUE(start_event.has_cuda() == true);
int counter = 0; int counter = 0;
while (counter != 1000) { while (counter != 1000) {
counter++; counter++;
} }
Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); Event stop_event(EventType::kPopRange, "test", 0, dev_ctx);
EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
} }
#endif #endif
...@@ -55,7 +56,7 @@ TEST(Event, CudaElapsedTime) { ...@@ -55,7 +56,7 @@ TEST(Event, CudaElapsedTime) {
TEST(RecordEvent, RecordEvent) { TEST(RecordEvent, RecordEvent) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventType;
using paddle::platform::RecordEvent; using paddle::platform::RecordEvent;
using paddle::platform::ProfilerState; using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey; using paddle::platform::EventSortingKey;
......
...@@ -2,13 +2,13 @@ if(WITH_PYTHON) ...@@ -2,13 +2,13 @@ if(WITH_PYTHON)
if(WITH_AMD_GPU) if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED hip_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
parallel_executor parallel_executor
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
else() else()
cc_library(paddle_pybind SHARED cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
parallel_executor parallel_executor
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID) if(NOT APPLE AND NOT ANDROID)
......
...@@ -252,7 +252,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -252,7 +252,6 @@ All parameter, weight, gradient are variables in Paddle.
py::return_value_policy::reference); py::return_value_policy::reference);
py::class_<framework::ReaderHolder>(m, "Reader", "") py::class_<framework::ReaderHolder>(m, "Reader", "")
.def("has_next", &framework::ReaderHolder::HasNext)
.def("reset", &framework::ReaderHolder::ReInit); .def("reset", &framework::ReaderHolder::ReInit);
py::class_<Scope>(m, "Scope", "") py::class_<Scope>(m, "Scope", "")
...@@ -465,7 +464,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -465,7 +464,8 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_gflags", framework::InitGflags); m.def("init_gflags", framework::InitGflags);
m.def("init_glog", framework::InitGLOG); m.def("init_glog", framework::InitGLOG);
m.def("init_devices", &framework::InitDevices); m.def("init_devices",
[](bool init_p2p) { framework::InitDevices(init_p2p); });
m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -544,13 +544,21 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -544,13 +544,21 @@ All parameter, weight, gradient are variables in Paddle.
[](ParallelExecutor &self, size_t num_threads, bool use_event, [](ParallelExecutor &self, size_t num_threads, bool use_event,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, bool allow_op_delay) { Scope *scope, std::vector<Scope *> &local_scopes,
new (&self) ParallelExecutor(num_threads, use_event, places, bool allow_op_delay) {
params, startup_program, main_program, new (&self)
loss_var_name, scope, allow_op_delay); ParallelExecutor(num_threads, use_event, places, params,
bcast_vars, main_program, loss_var_name,
scope, local_scopes, allow_op_delay);
}) })
.def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
.def("local_scopes",
[](ParallelExecutor &self) -> std::vector<Scope *> * {
return &self.GetLocalScopes();
},
py::return_value_policy::reference)
.def("run", &ParallelExecutor::Run); .def("run", &ParallelExecutor::Run);
BindRecordIOWriter(&m); BindRecordIOWriter(&m);
......
...@@ -39,7 +39,7 @@ class RecordIOWriter { ...@@ -39,7 +39,7 @@ class RecordIOWriter {
void CompleteAppendTensor() { void CompleteAppendTensor() {
auto& ctx = auto& ctx =
*platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
framework::WriteToRecordIO(writer_, tensors_, ctx); framework::WriteToRecordIO(&writer_, tensors_, ctx);
tensors_.clear(); tensors_.clear();
} }
......
...@@ -6,6 +6,6 @@ if(WITH_TESTING) ...@@ -6,6 +6,6 @@ if(WITH_TESTING)
add_library(paddle_test_util STATIC TestUtil.cpp) add_library(paddle_test_util STATIC TestUtil.cpp)
add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE)
cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags) cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init memory gtest gflags)
endif() endif()
endif() endif()
...@@ -41,6 +41,6 @@ int main(int argc, char** argv) { ...@@ -41,6 +41,6 @@ int main(int argc, char** argv) {
paddle::memory::Used(paddle::platform::CUDAPlace(0)); paddle::memory::Used(paddle::platform::CUDAPlace(0));
#endif #endif
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
} }
*pyc *pyc
build build
dist dist
paddlepaddle.egg-info
paddle.egg-info paddle.egg-info
paddlepaddle_gpu.egg-info paddlepaddle_gpu.egg-info
.idea .idea
......
...@@ -29,6 +29,7 @@ import optimizer ...@@ -29,6 +29,7 @@ import optimizer
import backward import backward
import regularizer import regularizer
import average import average
import metrics
from param_attr import ParamAttr, WeightNormParamAttr from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
...@@ -85,6 +86,8 @@ def __bootstrap__(): ...@@ -85,6 +86,8 @@ def __bootstrap__():
import core import core
import os import os
in_test = 'unittest' in sys.modules
try: try:
num_threads = int(os.getenv('OMP_NUM_THREADS', '1')) num_threads = int(os.getenv('OMP_NUM_THREADS', '1'))
except ValueError: except ValueError:
...@@ -109,8 +112,11 @@ def __bootstrap__(): ...@@ -109,8 +112,11 @@ def __bootstrap__():
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0]) core.init_glog(sys.argv[0])
core.init_devices() # don't init_p2p when in unittest to save time.
core.init_devices(not in_test)
# TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
# Consider paddle.init(args) or paddle.main(args)
layers.monkey_patch_variable() layers.monkey_patch_variable()
__bootstrap__() __bootstrap__()
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
import warnings
""" """
Class of all kinds of Average. Class of all kinds of Average.
...@@ -22,6 +23,8 @@ import numpy as np ...@@ -22,6 +23,8 @@ import numpy as np
wrappers of Python functions. wrappers of Python functions.
""" """
__all__ = ["WeightedAverage"]
def _is_number_(var): def _is_number_(var):
return isinstance(var, int) or isinstance(var, float) or (isinstance( return isinstance(var, int) or isinstance(var, float) or (isinstance(
...@@ -34,6 +37,9 @@ def _is_number_or_matrix_(var): ...@@ -34,6 +37,9 @@ def _is_number_or_matrix_(var):
class WeightedAverage(object): class WeightedAverage(object):
def __init__(self): def __init__(self):
warnings.warn(
"The %s is deprecated, please use fluid.metrics.Accuracy instead." %
(self.__class__.__name__), Warning)
self.reset() self.reset()
def reset(self): def reset(self):
......
...@@ -16,6 +16,7 @@ import sys ...@@ -16,6 +16,7 @@ import sys
import re import re
from graphviz import GraphPreviewGenerator from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2 import proto.framework_pb2 as framework_pb2
from google.protobuf import text_format
_vartype2str_ = [ _vartype2str_ = [
"UNK", "UNK",
...@@ -100,7 +101,7 @@ def repr_var(vardesc): ...@@ -100,7 +101,7 @@ def repr_var(vardesc):
def pprint_program_codes(program_desc): def pprint_program_codes(program_desc):
reprs = [] reprs = []
for block_idx in range(program_desc.num_blocks()): for block_idx in range(program_desc.desc.num_blocks()):
block_desc = program_desc.block(block_idx) block_desc = program_desc.block(block_idx)
block_repr = pprint_block_codes(block_desc) block_repr = pprint_block_codes(block_desc)
reprs.append(block_repr) reprs.append(block_repr)
...@@ -127,7 +128,7 @@ def pprint_block_codes(block_desc, show_backward=False): ...@@ -127,7 +128,7 @@ def pprint_block_codes(block_desc, show_backward=False):
if type(block_desc) is not framework_pb2.BlockDesc: if type(block_desc) is not framework_pb2.BlockDesc:
block_desc = framework_pb2.BlockDesc.FromString( block_desc = framework_pb2.BlockDesc.FromString(
block_desc.serialize_to_string()) block_desc.desc.serialize_to_string())
var_reprs = [] var_reprs = []
op_reprs = [] op_reprs = []
for var in block_desc.vars: for var in block_desc.vars:
...@@ -237,13 +238,13 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"): ...@@ -237,13 +238,13 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
# draw parameters and args # draw parameters and args
vars = {} vars = {}
for var in desc.vars: for var in desc.vars:
shape = [str(i) for i in var.lod_tensor.tensor.dims] # TODO(gongwb): format the var.type
if not shape:
shape = ['null']
# create var # create var
if var.persistable: if var.persistable:
varn = graph.add_param( varn = graph.add_param(
var.name, var.type, shape, highlight=need_highlight(var.name)) var.name,
str(var.type).replace("\n", "<br />", 1),
highlight=need_highlight(var.name))
else: else:
varn = graph.add_arg(var.name, highlight=need_highlight(var.name)) varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
vars[var.name] = varn vars[var.name] = varn
...@@ -268,4 +269,4 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"): ...@@ -268,4 +269,4 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
for var in op.outputs: for var in op.outputs:
add_op_link_var(opn, var, True) add_op_link_var(opn, var, True)
graph(path, show=True) graph(path, show=False)
...@@ -102,6 +102,8 @@ def split_dense_variable(var_list, ...@@ -102,6 +102,8 @@ def split_dense_variable(var_list,
the parameter server side can gain better performance. By default the parameter server side can gain better performance. By default
minimum block size is 1024. The max block size is used to prevent minimum block size is 1024. The max block size is used to prevent
very large blocks that may cause send error. very large blocks that may cause send error.
:return: A list of VarBlocks. Each VarBlock specifies a shard of
the var.
""" """
blocks = [] blocks = []
for var in var_list: for var in var_list:
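A rough Python sketch of the splitting policy documented above (illustrative only; split_variable is a made-up helper and the numbers are arbitrary): a variable of size elements is cut into at most pserver-count shards, but never into pieces smaller than the minimum block size.

    def split_variable(size, pserver_count, min_block_size=1024):
        # use fewer shards than pservers when the variable is too small to split
        shard_count = min(pserver_count, max(1, size // min_block_size))
        block_size = (size + shard_count - 1) // shard_count
        blocks, offset = [], 0
        while offset < size:
            blocks.append((offset, min(block_size, size - offset)))
            offset += block_size
        return blocks

    print(split_variable(10000, 4))   # four shards of 2500 elements each
    print(split_variable(1500, 4))    # a single shard; too small to split further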
...@@ -192,22 +194,24 @@ class DistributeTranspiler: ...@@ -192,22 +194,24 @@ class DistributeTranspiler:
self.trainer_id = trainer_id self.trainer_id = trainer_id
pserver_endpoints = pservers.split(",") pserver_endpoints = pservers.split(",")
# step1 # step1: For large parameters and gradients, split them into smaller
# blocks.
param_list = [pg[0] for pg in params_grads] param_list = [pg[0] for pg in params_grads]
grad_list = [pg[1] for pg in params_grads] grad_list = [pg[1] for pg in params_grads]
grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints)) grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
param_blocks = split_dense_variable(param_list, len(pserver_endpoints)) param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
# step2 # step2: Create new vars for the parameters and gradients blocks and
# add ops to do the split.
grad_var_mapping = self._append_split_op(program, grad_blocks) grad_var_mapping = self._append_split_op(program, grad_blocks)
# step3 param_var_mapping = self._create_vars_from_blocklist(program,
param_blocks)
# step3: Add gradients as send op inputs and parameters as send
# op outputs.
send_inputs = [] send_inputs = []
send_outputs = [] send_outputs = []
for b in grad_blocks: # append by order for b in grad_blocks: # append by order
varname, block_id, _ = b.split(":") varname, block_id, _ = b.split(":")
send_inputs.append(grad_var_mapping[varname][int(block_id)]) send_inputs.append(grad_var_mapping[varname][int(block_id)])
param_var_mapping = self._create_vars_from_blocklist(program,
param_blocks)
for b in param_blocks: for b in param_blocks:
varname, block_id, _ = b.split(":") varname, block_id, _ = b.split(":")
send_outputs.append(param_var_mapping[varname][int(block_id)]) send_outputs.append(param_var_mapping[varname][int(block_id)])
...@@ -237,7 +241,7 @@ class DistributeTranspiler: ...@@ -237,7 +241,7 @@ class DistributeTranspiler:
"RPCClient": rpc_client_var}, "RPCClient": rpc_client_var},
attrs={"endpoints": pserver_endpoints, attrs={"endpoints": pserver_endpoints,
"epmap": eplist}) "epmap": eplist})
# step4 # step4: Concat the parameters splits together after recv.
for varname, splited_var in param_var_mapping.iteritems(): for varname, splited_var in param_var_mapping.iteritems():
if len(splited_var) <= 1: if len(splited_var) <= 1:
continue continue
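Step 3 pairs every gradient block with one parameter-server endpoint, and that assignment is what the epmap attribute of the send op records. A toy Python sketch of a round-robin assignment (illustrative only; round_robin and the endpoint strings are made up):

    def round_robin(block_names, endpoints):
        # block i goes to endpoint i % len(endpoints)
        return {name: endpoints[i % len(endpoints)]
                for i, name in enumerate(block_names)}

    eplist = round_robin(["w@GRAD.block0", "w@GRAD.block1", "b@GRAD.block0"],
                         ["127.0.0.1:6170", "127.0.0.1:6171"])
    print(eplist)
    # {'w@GRAD.block0': '127.0.0.1:6170', 'w@GRAD.block1': '127.0.0.1:6171',
    #  'b@GRAD.block0': '127.0.0.1:6170'}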
...@@ -251,6 +255,7 @@ class DistributeTranspiler: ...@@ -251,6 +255,7 @@ class DistributeTranspiler:
def get_trainer_program(self): def get_trainer_program(self):
# remove optimize ops and add a send op to main_program # remove optimize ops and add a send op to main_program
self.program.global_block().delete_ops(self.optimize_ops) self.program.global_block().delete_ops(self.optimize_ops)
self.program.sync_with_cpp()
# FIXME(typhoonzero): serialize once will fix error occurs when clone. # FIXME(typhoonzero): serialize once will fix error occurs when clone.
self.program.__str__() self.program.__str__()
return self.program return self.program
...@@ -258,13 +263,14 @@ class DistributeTranspiler: ...@@ -258,13 +263,14 @@ class DistributeTranspiler:
def get_pserver_program(self, endpoint): def get_pserver_program(self, endpoint):
""" """
Get pserver side program using the endpoint. Get pserver side program using the endpoint.
TODO(panyx0718): Revisit this assumption. What if #blocks > #pservers?
NOTE: assume blocks of the same variable is not distributed NOTE: assume blocks of the same variable is not distributed
on the same pserver, only change param/grad varnames for on the same pserver, only change param/grad varnames for
trainers to fetch. trainers to fetch.
""" """
# step1 # step1
pserver_program = Program() pserver_program = Program()
# step2 # step2: Create vars to receive vars at parameter servers.
recv_inputs = [] recv_inputs = []
for v in self.param_grad_ep_mapping[endpoint]["params"]: for v in self.param_grad_ep_mapping[endpoint]["params"]:
self._clone_var(pserver_program.global_block(), v) self._clone_var(pserver_program.global_block(), v)
...@@ -273,17 +279,21 @@ class DistributeTranspiler: ...@@ -273,17 +279,21 @@ class DistributeTranspiler:
# we don't need to create them when grad arrives. # we don't need to create them when grad arrives.
# change client side var name to origin name by # change client side var name to origin name by
# removing ".trainer_%d" suffix # removing ".trainer_%d" suffix
suff_idx = v.name.find(".trainer_") suff_idx = v.name.find(".trainer_")
if suff_idx >= 0: if suff_idx >= 0:
orig_var_name = v.name[:suff_idx] orig_var_name = v.name[:suff_idx]
else: else:
orig_var_name = v.name orig_var_name = v.name
single_trainer_var = pserver_program.global_block().create_var( # NOTE: single_trainer_var must be created for multi-trainer
name=orig_var_name, # case to merge grads from multiple trainers
persistable=True, single_trainer_var = \
type=v.type, pserver_program.global_block().create_var(
dtype=v.dtype, name=orig_var_name,
shape=v.shape) persistable=True,
type=v.type,
dtype=v.dtype,
shape=v.shape)
if self.trainers > 1: if self.trainers > 1:
for trainer_id in xrange(self.trainers): for trainer_id in xrange(self.trainers):
var = pserver_program.global_block().create_var( var = pserver_program.global_block().create_var(
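The note above explains why single_trainer_var is still created: with several trainers, each one sends its gradient under a ".trainer_<id>" suffix and the parameter server merges those copies back into the one original variable. A toy numpy sketch of that merge (illustrative only; on the pserver it is a sum op, followed here by an averaging scale):

    import numpy as np

    trainers = 2
    orig_name = "fc_0.w_0@GRAD"
    # gradients received from each trainer under its suffixed name
    received = {"%s.trainer_%d" % (orig_name, i): np.full(4, i + 1.0)
                for i in range(trainers)}
    merged = sum(received.values())    # what the pserver-side sum op computes
    merged = merged / trainers         # averaged afterwards in this sketch
    print(sorted(received), merged)    # [...trainer_0, ...trainer_1] [1.5 1.5 1.5 1.5]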
...@@ -344,7 +354,7 @@ class DistributeTranspiler: ...@@ -344,7 +354,7 @@ class DistributeTranspiler:
self._append_pserver_non_opt_ops(block, op) self._append_pserver_non_opt_ops(block, op)
append_block = optimize_block append_block = optimize_block
# append lr decay ops to the child block if exits # append lr decay ops to the child block if exists
lr_ops = self._get_lr_ops() lr_ops = self._get_lr_ops()
if len(lr_ops) > 0: if len(lr_ops) > 0:
for _, op in enumerate(lr_ops): for _, op in enumerate(lr_ops):
...@@ -447,8 +457,10 @@ class DistributeTranspiler: ...@@ -447,8 +457,10 @@ class DistributeTranspiler:
block_list, block_list,
add_trainer_suffix=False): add_trainer_suffix=False):
""" """
Create vars for each split.
NOTE: only grads need to be named for different trainers, use NOTE: only grads need to be named for different trainers, use
add_trainer_suffix to rename the grad vars. add_trainer_suffix to rename the grad vars.
:return: A dict mapping from original var name to each var split.
""" """
block_map = dict() block_map = dict()
var_mapping = dict() var_mapping = dict()
...@@ -615,6 +627,7 @@ class DistributeTranspiler: ...@@ -615,6 +627,7 @@ class DistributeTranspiler:
type="sum", type="sum",
inputs={"X": vars2merge}, inputs={"X": vars2merge},
outputs={"Out": merged_var}) outputs={"Out": merged_var})
# TODO(panyx0718): What if it's SELECTED_ROWS.
if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS: if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
optimize_block.append_op( optimize_block.append_op(
type="scale", type="scale",
...@@ -638,7 +651,7 @@ class DistributeTranspiler: ...@@ -638,7 +651,7 @@ class DistributeTranspiler:
shape=param_block.shape) shape=param_block.shape)
new_inputs[key] = tmpvar new_inputs[key] = tmpvar
elif key == "LearningRate": elif key == "LearningRate":
# leraning rate variable has already be created by non-optimize op, # learning rate variable has already been created by non-optimize op,
# don't create it once again. # don't create it once again.
lr_varname = opt_op.input(key)[0] lr_varname = opt_op.input(key)[0]
if pserver_block.vars.has_key(lr_varname): if pserver_block.vars.has_key(lr_varname):
...@@ -773,6 +786,7 @@ class DistributeTranspiler: ...@@ -773,6 +786,7 @@ class DistributeTranspiler:
return False return False
def _get_input_map_from_op(self, varmap, op): def _get_input_map_from_op(self, varmap, op):
"""Returns a dict from op input name to the vars in varmap."""
iomap = dict() iomap = dict()
for key in op.input_names: for key in op.input_names:
vars = [] vars = []
...@@ -785,6 +799,7 @@ class DistributeTranspiler: ...@@ -785,6 +799,7 @@ class DistributeTranspiler:
return iomap return iomap
def _get_output_map_from_op(self, varmap, op): def _get_output_map_from_op(self, varmap, op):
"""Returns a dict from op output name to the vars in varmap."""
iomap = dict() iomap = dict()
for key in op.output_names: for key in op.output_names:
vars = [] vars = []
...@@ -812,6 +827,7 @@ class DistributeTranspiler: ...@@ -812,6 +827,7 @@ class DistributeTranspiler:
find_ops.append(op) find_ops.append(op)
# make a union find struct by the ops in default_main_program # make a union find struct by the ops in default_main_program
ufind = UnionFind(block.ops) ufind = UnionFind(block.ops)
for op1 in block.ops: for op1 in block.ops:
for op2 in block.ops: for op2 in block.ops:
# NOTE: we need to skip all optimize ops, since it is connected # NOTE: we need to skip all optimize ops, since it is connected
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import warnings
import numpy as np import numpy as np
import layers import layers
...@@ -59,6 +60,9 @@ class Evaluator(object): ...@@ -59,6 +60,9 @@ class Evaluator(object):
""" """
def __init__(self, name, **kwargs): def __init__(self, name, **kwargs):
warnings.warn(
"The %s is deprecated, because maintain a modified program inside evaluator cause bug easily, please use fluid.metrics.%s instead."
% (self.__class__.__name__, self.__class__.__name__), Warning)
self.states = [] self.states = []
self.metrics = [] self.metrics = []
self.helper = LayerHelper(name, **kwargs) self.helper = LayerHelper(name, **kwargs)
......
...@@ -659,7 +659,7 @@ class Block(object): ...@@ -659,7 +659,7 @@ class Block(object):
def __init__(self, program, idx): def __init__(self, program, idx):
self.desc = program.desc.block(idx) self.desc = program.desc.block(idx)
self.vars = dict() # var_name --> var self.vars = dict() # var_name --> var
self.ops = collections.deque() # operator list self.ops = list() # operator list
self.program = program self.program = program
self.removed_vars = dict() self.removed_vars = dict()
...@@ -818,6 +818,11 @@ class Block(object): ...@@ -818,6 +818,11 @@ class Block(object):
del self.vars[name] del self.vars[name]
self.sync_with_cpp() self.sync_with_cpp()
def remove_var(self, name):
self.sync_with_cpp()
self.desc.remove_var(name)
del self.vars[name]
def create_parameter(self, *args, **kwargs): def create_parameter(self, *args, **kwargs):
global_block = self.program.global_block() global_block = self.program.global_block()
param = Parameter(global_block, *args, **kwargs) param = Parameter(global_block, *args, **kwargs)
...@@ -831,6 +836,18 @@ class Block(object): ...@@ -831,6 +836,18 @@ class Block(object):
self.ops.append(op) self.ops.append(op)
return op return op
def insert_op(self, index, *args, **kwargs):
self.sync_with_cpp()
op_desc = self.desc.insert_op(index)
op = Operator(block=self, desc=op_desc, *args, **kwargs)
self.ops.insert(index, op)
return op
def remove_op(self, index):
self.sync_with_cpp()
self.desc.remove_op(index, index + 1)
del self.ops[index]
def delete_ops(self, ops): def delete_ops(self, ops):
# remove from cpp # remove from cpp
# FIXME(typhoonzero): remove only the first occurrence. # FIXME(typhoonzero): remove only the first occurrence.
...@@ -839,15 +856,16 @@ class Block(object): ...@@ -839,15 +856,16 @@ class Block(object):
end = list(self.ops).index(ops[-1]) end = list(self.ops).index(ops[-1])
except Exception, e: except Exception, e:
raise e raise e
self.desc.remove_op(start, end + 1) self.desc.remove_op(start, end + 1)
def slice_ops(self, start, end): def slice_ops(self, start, end):
return list(self.ops)[start:end] return self.ops[start:end]
def prepend_op(self, *args, **kwargs): def prepend_op(self, *args, **kwargs):
op_desc = self.desc.prepend_op() op_desc = self.desc.prepend_op()
op = Operator(self, op_desc, *args, **kwargs) op = Operator(self, op_desc, *args, **kwargs)
self.ops.appendleft(op) self.ops.insert(0, op)
return op return op
def sync_with_cpp(self): def sync_with_cpp(self):
...@@ -892,7 +910,7 @@ class Block(object): ...@@ -892,7 +910,7 @@ class Block(object):
for index in range((start_index - 1 - 1), -1, -1): for index in range((start_index - 1 - 1), -1, -1):
op_desc = ops_in_cpp[index] op_desc = ops_in_cpp[index]
op = Operator(self, op_desc) op = Operator(self, op_desc)
self.ops.appendleft(op) self.ops.insert(0, op)
# sync ops append to the end of cpp_ops # sync ops append to the end of cpp_ops
for index in range((end_index + 1), len(ops_in_cpp)): for index in range((end_index + 1), len(ops_in_cpp)):
...@@ -965,6 +983,13 @@ class Block(object): ...@@ -965,6 +983,13 @@ class Block(object):
if var.type == core.VarDesc.VarType.STEP_SCOPES: if var.type == core.VarDesc.VarType.STEP_SCOPES:
ret_var = self.create_var( ret_var = self.create_var(
name=var.name, persistable=var.persistable, type=var.type) name=var.name, persistable=var.persistable, type=var.type)
elif var.type == core.VarDesc.VarType.SELECTED_ROWS:
ret_var = self.create_var(
name=var.name,
shape=var.shape,
dtype=var.dtype,
type=var.type,
persistable=True)
else: else:
ret_var = self.create_var( ret_var = self.create_var(
name=var.name, name=var.name,
......
...@@ -83,7 +83,7 @@ class Graph(object): ...@@ -83,7 +83,7 @@ class Graph(object):
file = open(dot_path, 'w') file = open(dot_path, 'w')
file.write(self.__str__()) file.write(self.__str__())
image_path = os.path.join( image_path = os.path.join(
os.path.dirname(__file__), dot_path[:-3] + "pdf") os.path.dirname(dot_path), dot_path[:-3] + "pdf")
cmd = ["dot", "-Tpdf", dot_path, "-o", image_path] cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
subprocess.Popen( subprocess.Popen(
cmd, cmd,
...@@ -199,7 +199,7 @@ class GraphPreviewGenerator(object): ...@@ -199,7 +199,7 @@ class GraphPreviewGenerator(object):
else: else:
self.graph.show(path) self.graph.show(path)
def add_param(self, name, data_type, shape, highlight=False): def add_param(self, name, data_type, highlight=False):
label = '\n'.join([ label = '\n'.join([
'<<table cellpadding="5">', '<<table cellpadding="5">',
' <tr>', ' <tr>',
...@@ -214,11 +214,6 @@ class GraphPreviewGenerator(object): ...@@ -214,11 +214,6 @@ class GraphPreviewGenerator(object):
str(data_type), str(data_type),
' </td>' ' </td>'
' </tr>', ' </tr>',
' <tr>',
' <td>',
'[%s]' % 'x'.join(shape),
' </td>'
' </tr>',
'</table>>', '</table>>',
]) ])
return self.graph.node( return self.graph.node(
......
...@@ -18,7 +18,8 @@ import contextlib ...@@ -18,7 +18,8 @@ import contextlib
__all__ = [ __all__ = [
'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu', 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
'init_on_cpu' 'init_on_cpu', 'ConstantInitializer', 'UniformInitializer',
'NormalInitializer', 'XavierInitializer'
] ]
_force_init_on_cpu_ = False _force_init_on_cpu_ = False
......
...@@ -21,8 +21,7 @@ from ..executor import global_scope ...@@ -21,8 +21,7 @@ from ..executor import global_scope
__all__ = [ __all__ = [
'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
'open_files', 'read_file', 'create_shuffle_reader', 'open_files', 'read_file', 'shuffle', 'double_buffer'
'create_double_buffer_reader', 'create_multi_pass_reader'
] ]
...@@ -237,13 +236,9 @@ def monkey_patch_reader_methods(reader): ...@@ -237,13 +236,9 @@ def monkey_patch_reader_methods(reader):
var = scope.find_var(reader.name) var = scope.find_var(reader.name)
return var.get_reader() return var.get_reader()
def eof():
return not __get_reader__().has_next()
def reset(): def reset():
return __get_reader__().reset() return __get_reader__().reset()
reader.eof = eof
reader.reset = reset reader.reset = reset
reader.stop_gradient = True reader.stop_gradient = True
reader.persistable = True reader.persistable = True
...@@ -283,7 +278,42 @@ def _copy_reader_create_op_(block, op): ...@@ -283,7 +278,42 @@ def _copy_reader_create_op_(block, op):
return new_op return new_op
def open_recordio_file(filename, shapes, lod_levels, dtypes): def open_recordio_file(filename,
shapes,
lod_levels,
dtypes,
pass_num=1,
for_parallel=False):
"""
Open a RecordIO file
This layer takes a RecordIO file to read from and returns a Reader Variable.
Via the Reader Variable, we can get data from the given RecordIO file.
Args:
filename(str): The RecordIO file's name.
shapes(list): List of tuples declaring the data shapes.
lod_levels(list): List of ints declaring the data lod_level.
dtypes(list): List of strs declaring the data type.
pass_num(int): Number of passes to run.
for_parallel(Bool): Set it as True if you are going to run
subsequent operators in parallel.
Returns:
Variable: A Reader Variable via which we can get RecordIO file data.
Examples:
.. code-block:: python
reader = fluid.layers.io.open_recordio_file(
filename='./data.recordio',
shapes=[(3,224,224), (1)],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
# Via the reader, we can use 'read_file' layer to get data:
image, label = fluid.layers.read_file(reader)
"""
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
shape_concat = [] shape_concat = []
ranks = [] ranks = []
...@@ -310,10 +340,63 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes): ...@@ -310,10 +340,63 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes):
startup_var.persistable = True startup_var.persistable = True
main_prog_var = _copy_reader_var_(default_main_program().current_block(), main_prog_var = _copy_reader_var_(default_main_program().current_block(),
startup_var) startup_var)
if pass_num > 1:
main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
if for_parallel:
main_prog_var = parallel(reader=main_prog_var)
return monkey_patch_reader_methods(main_prog_var) return monkey_patch_reader_methods(main_prog_var)
def open_files(filenames, thread_num, shapes, lod_levels, dtypes): def open_files(filenames,
shapes,
lod_levels,
dtypes,
thread_num,
buffer_size=None,
pass_num=1,
for_parallel=False):
"""
Open files
This layer takes a list of files to read from and returns a Reader Variable.
Via the Reader Variable, we can get data from the given files. All files must
have name suffixes to indicate their formats, e.g., '*.recordio'.
Args:
filenames(list): The list of file names.
shapes(list): List of tuples declaring the data shapes.
lod_levels(list): List of ints declaring the data lod_level.
dtypes(list): List of strs declaring the data type.
thread_num(int): The maximal concurrent prefetch thread number.
buffer_size(int): The size of prefetch buffer.
pass_num(int): Number of passes to run.
for_parallel(Bool): Set it as True if you are going to run
subsequent operators in parallel.
Returns:
Variable: A Reader Variable via which we can get file data.
Examples:
.. code-block:: python
reader = fluid.layers.io.open_files(filenames=['./data1.recordio',
'./data2.recordio'],
shapes=[(3,224,224), (1)],
lod_levels=[0, 0],
dtypes=['float32', 'int64'],
thread_num=2,
buffer_size=2)
# Via the reader, we can use 'read_file' layer to get data:
image, label = fluid.layers.io.read_file(reader)
"""
if buffer_size is None:
buffer_size = thread_num
if isinstance(filenames, basestring):
filenames = [filenames]
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
shape_concat = [] shape_concat = []
ranks = [] ranks = []
...@@ -322,29 +405,36 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes): ...@@ -322,29 +405,36 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
shape_concat.extend(shape) shape_concat.extend(shape)
ranks.append(len(shape)) ranks.append(len(shape))
var_name = unique_name('multiple_reader') multi_file_reader_name = unique_name('multi_file_reader')
startup_blk = default_startup_program().current_block() startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name) startup_reader = startup_blk.create_var(name=multi_file_reader_name)
startup_blk.append_op( startup_blk.append_op(
type='open_files', type='open_files',
outputs={'Out': [startup_var]}, outputs={'Out': [startup_reader]},
attrs={ attrs={
'shape_concat': shape_concat, 'shape_concat': shape_concat,
'lod_levels': lod_levels, 'lod_levels': lod_levels,
'ranks': ranks, 'ranks': ranks,
'file_names': filenames, 'file_names': filenames,
'thread_num': thread_num 'thread_num': thread_num,
'buffer_size': buffer_size
}) })
startup_var.desc.set_dtypes(dtypes) startup_reader.desc.set_dtypes(dtypes)
startup_var.persistable = True startup_reader.persistable = True
main_prog_var = _copy_reader_var_(default_main_program().current_block(), main_prog_reader = _copy_reader_var_(default_main_program().current_block(),
startup_var) startup_reader)
return monkey_patch_reader_methods(main_prog_var) if pass_num > 1:
main_prog_reader = multi_pass(
reader=main_prog_reader, pass_num=pass_num)
if for_parallel:
main_prog_reader = parallel(reader=main_prog_reader)
return monkey_patch_reader_methods(main_prog_reader)
def __create_decorated_reader__(op_type, reader, attrs): def __create_shared_decorated_reader__(op_type, reader, attrs):
var_name = unique_name(op_type) var_name = unique_name(op_type)
startup_blk = default_startup_program().current_block() startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name) startup_var = startup_blk.create_var(name=var_name)
...@@ -360,22 +450,41 @@ def __create_decorated_reader__(op_type, reader, attrs): ...@@ -360,22 +450,41 @@ def __create_decorated_reader__(op_type, reader, attrs):
return monkey_patch_reader_methods(main_prog_var) return monkey_patch_reader_methods(main_prog_var)
def create_shuffle_reader(reader, buffer_size): def __create_unshared_decorated_reader__(op_type, reader, attrs):
return __create_decorated_reader__('create_shuffle_reader', reader, new_reader_name = unique_name(op_type)
{'buffer_size': int(buffer_size)}) main_blk = default_main_program().current_block()
new_reader = main_blk.create_var(name=new_reader_name)
main_blk.append_op(
type=op_type,
inputs={'UnderlyingReader': reader},
outputs={'Out': [new_reader]},
attrs=attrs)
new_reader.persistable = True
new_reader.stop_gradient = True
return monkey_patch_reader_methods(new_reader)
def shuffle(reader, buffer_size):
return __create_unshared_decorated_reader__(
'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
def create_double_buffer_reader(reader, place=None): def double_buffer(reader, place=None):
attrs = dict() attrs = dict()
if place is not None: if place is not None:
attrs['place'] = str(place).upper() attrs['place'] = str(place).upper()
return __create_decorated_reader__('create_double_buffer_reader', reader, return __create_unshared_decorated_reader__('create_double_buffer_reader',
attrs) reader, attrs)
def multi_pass(reader, pass_num):
return __create_shared_decorated_reader__(
'create_multi_pass_reader', reader, {'pass_num': int(pass_num)})
def create_multi_pass_reader(reader, pass_num): def parallel(reader):
return __create_decorated_reader__('create_multi_pass_reader', reader, return __create_shared_decorated_reader__('create_threaded_reader', reader,
{'pass_num': int(pass_num)}) {})
def read_file(file_obj): def read_file(file_obj):
......
...@@ -15,12 +15,13 @@ ...@@ -15,12 +15,13 @@
All layers just related to metric. All layers just related to metric.
""" """
import warnings
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant from ..initializer import Normal, Constant
from ..framework import Variable from ..framework import Variable
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
__all__ = ['accuracy'] __all__ = ['accuracy', 'auc']
def accuracy(input, label, k=1, correct=None, total=None): def accuracy(input, label, k=1, correct=None, total=None):
...@@ -55,3 +56,37 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -55,3 +56,37 @@ def accuracy(input, label, k=1, correct=None, total=None):
"Total": [total], "Total": [total],
}) })
return acc_out return acc_out
def auc(input, label, curve='ROC', num_thresholds=200):
warnings.warn(
"This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \
but can not aggregate them and get the pass AUC, because pass \
auc can not be averaged with weighted from the minibatch auc value. \
Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \
which can get every minibatch and every pass auc value.", Warning)
helper = LayerHelper("auc", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
auc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
if total is None:
total = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="accuracy",
inputs={
"Out": [topk_out],
"Indices": [topk_indices],
"Label": [label]
},
attrs={"curve": curve,
"num_thresholds": num_thresholds},
outputs={"AUC": [auc_out], })
return auc_out
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fluid Metrics
The metrics are computed natively in Python.
"""
import numpy as np
import copy
import warnings
__all__ = [
'MetricBase',
'CompositeMetric',
'Accuracy',
'ChunkEvaluator',
'EditDistance',
'DetectionMAP',
'Auc',
]
def _is_numpy_(var):
return isinstance(var, (np.ndarray, np.generic))
def _is_number_(var):
return isinstance(var, int) or isinstance(var, float) or (isinstance(
var, np.ndarray) and var.shape == (1, ))
def _is_number_or_matrix_(var):
return _is_number_(var) or isinstance(var, np.ndarray)
class MetricBase(object):
"""
Base Class for all evaluators
Args:
        name(str): The name of the evaluator, such as "accuracy". Used to
            generate temporary variable names.
    Interface:
        Note(*): the states are the attributes without the '_' prefix.
        get_config(): get the current states and configuration.
        reset(): clear the states. If a state's type is not (int, float,
            np.ndarray), please override this method.
        update(): update the states at every minibatch.
        eval(): get the metric evaluation as a numpy value.
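    Example:
        A sketch of a custom metric following the state convention above
        (the class and attribute names are illustrative):
            class MeanLoss(MetricBase):
                def __init__(self, name=None):
                    super(MeanLoss, self).__init__(name)
                    # states without the '_' prefix are handled by reset()
                    self.total = .0
                    self.count = 0
                def update(self, loss, num):
                    self.total += loss * num
                    self.count += num
                def eval(self):
                    return self.total / self.count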
"""
def __init__(self, name, **kwargs):
        self._name = str(name) if name is not None else self.__class__.__name__
        self._kwargs = kwargs if kwargs is not None else dict()
self.reset()
def __str__(self):
return self._name
def reset(self):
"""
        The states are the attributes without the '_' prefix.
        Reset the states of the metric.
"""
states = {
attr: value
for attr, value in self.__dict__.iteritems()
if not attr.startswith("_")
}
for attr, value in states.iteritems():
if isinstance(value, int):
setattr(self, attr, 0)
elif isinstance(value, float):
setattr(self, attr, .0)
elif isinstance(value, (np.ndarray, np.generic)):
setattr(self, attr, np.zeros_like(value))
else:
setattr(self, attr, None)
def get_config(self):
states = {
attr: value
for attr, value in self.__dict__.iteritems()
if not attr.startswith("_")
}
config = copy.deepcopy(self._kwargs)
config.update({"name": self._name, "states": copy.deepcopy(states)})
return config
def update(self):
raise NotImplementedError()
def eval(self):
raise NotImplementedError()
class CompositeMetric(MetricBase):
"""
    Compute multiple metrics in each minibatch, for example, merging F1,
    accuracy and recall into one metric.
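    Example (a usage sketch; the sub-metrics chosen here are illustrative):
        comp = fluid.metrics.CompositeMetric()
        acc = fluid.metrics.Accuracy()
        chunk = fluid.metrics.ChunkEvaluator()
        comp.add_metric(acc)
        comp.add_metric(chunk)
        # update acc and chunk with each minibatch as usual, then
        acc_result, chunk_result = comp.eval()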
"""
def __init__(self, name=None, **kwargs):
super(CompositeMetric, self).__init__(name, kwargs)
self._metrics = []
def add_metric(self, metric):
if not isinstance(metric, MetricBase):
raise ValueError("SubMetric should be inherit from MetricBase.")
self._metrics.append(metric)
def eval(self):
ans = []
for m in self._metrics:
ans.append(m.eval())
return ans
class Accuracy(MetricBase):
"""
Accumulate the accuracy from minibatches and compute the average accuracy
for every pass.
Args:
name: the metrics name
Example:
minibatch_accuracy = fluid.layers.accuracy(pred, label)
accuracy_evaluator = fluid.metrics.Accuracy()
        for epoch in range(PASS_NUM):
            accuracy_evaluator.reset()
            for data in batches:
                loss, acc = exe.run(fetch_list=[cost, minibatch_accuracy])
                accuracy_evaluator.update(value=acc, weight=len(data))
accuracy = accuracy_evaluator.eval()
"""
def __init__(self, name=None):
super(Accuracy, self).__init__(name)
self.value = .0
self.weight = .0
def update(self, value, weight):
if not _is_number_or_matrix_(value):
raise ValueError(
"The 'value' must be a number(int, float) or a numpy ndarray.")
if not _is_number_(weight):
raise ValueError("The 'weight' must be a number(int, float).")
self.value += value * weight
self.weight += weight
def eval(self):
if self.weight == 0:
raise ValueError(
"There is no data in Accuracy Metrics. Please check layers.accuracy output has added to Accuracy."
)
return self.value / self.weight
class ChunkEvaluator(MetricBase):
"""
Accumulate counter numbers output by chunk_eval from mini-batches and
    compute the precision, recall and F1-score using the accumulated counter
numbers.
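    Example (a usage sketch; the counter values are assumed to be fetched
    from the outputs of fluid.layers.chunk_eval for every minibatch):
        chunk_evaluator = fluid.metrics.ChunkEvaluator()
        for data in train_reader():
            # fetch num_infer, num_label and num_correct for this minibatch
            chunk_evaluator.update(num_infer, num_label, num_correct)
        precision, recall, f1 = chunk_evaluator.eval()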
"""
def __init__(self, name=None):
        super(ChunkEvaluator, self).__init__(name)
self.num_infer_chunks = 0
self.num_label_chunks = 0
self.num_correct_chunks = 0
def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
if not _is_number_or_matrix_(num_infer_chunks):
raise ValueError(
"The 'num_infer_chunks' must be a number(int, float) or a numpy ndarray."
)
if not _is_number_or_matrix_(num_label_chunks):
raise ValueError(
"The 'num_label_chunks' must be a number(int, float) or a numpy ndarray."
)
if not _is_number_or_matrix_(num_correct_chunks):
raise ValueError(
"The 'num_correct_chunks' must be a number(int, float) or a numpy ndarray."
)
self.num_infer_chunks += num_infer_chunks
self.num_label_chunks += num_label_chunks
self.num_correct_chunks += num_correct_chunks
def eval(self):
precision = float(
self.num_correct_chunks
) / self.num_infer_chunks if self.num_infer_chunks else 0
recall = float(self.num_correct_chunks
) / self.num_label_chunks if self.num_label_chunks else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if self.num_correct_chunks else 0
return precision, recall, f1_score
class EditDistance(MetricBase):
"""
Accumulate edit distance sum and sequence number from mini-batches and
compute the average edit_distance and instance error of all batches.
Args:
name: the metrics name
Example:
edit_distance_metrics = fluid.layers.edit_distance(input, label)
distance_evaluator = fluid.metrics.EditDistance()
        for epoch in range(PASS_NUM):
distance_evaluator.reset()
for data in batches:
loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics))
distance_evaluator.update(*edit_distance_metrics)
distance, instance_error = distance_evaluator.eval()
In the above example:
'distance' is the average of the edit distance in a pass.
'instance_error' is the instance error rate in a pass.
"""
    def __init__(self, name=None):
super(EditDistance, self).__init__(name)
self.total_distance = .0
self.seq_num = 0
self.instance_error = 0
def update(self, distances, seq_num):
if not _is_numpy_(distances):
raise ValueError("The 'distances' must be a numpy ndarray.")
if not _is_number_(seq_num):
raise ValueError("The 'seq_num' must be a number(int, float).")
seq_right_count = np.sum(distances == 0)
total_distance = np.sum(distances)
self.seq_num += seq_num
self.instance_error += seq_num - seq_right_count
self.total_distance += total_distance
    def eval(self):
if self.seq_num == 0:
raise ValueError(
"There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
)
avg_distance = self.total_distance / self.seq_num
avg_instance_error = self.instance_error / self.seq_num
return avg_distance, avg_instance_error
class DetectionMAP(MetricBase):
"""
Calculate the detection mean average precision (mAP).
TODO (Dang Qingqing): update the following doc.
The general steps are as follows:
1. calculate the true positive and false positive according to the input
of detection and labels.
    2. calculate the mAP value, supporting two versions: '11 point' and 'integral'.
Please get more information from the following articles:
https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325
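    Example (a sketch; `batch_map` is assumed to be fetched from the output of
    fluid.layers.detection_map for the current minibatch):
        map_metric = fluid.metrics.DetectionMAP()
        for data in train_reader():
            # run the program, fetch batch_map, then
            map_metric.update(value=batch_map, weight=len(data))
        pass_map = map_metric.eval()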
"""
def __init__(self, name=None):
super(DetectionMAP, self).__init__(name)
        # the accumulated mAP value and weight
        self.value = .0
        self.weight = .0
def update(self, value, weight):
if not _is_number_or_matrix_(value):
raise ValueError(
"The 'value' must be a number(int, float) or a numpy ndarray.")
if not _is_number_(weight):
raise ValueError("The 'weight' must be a number(int, float).")
self.value += value
self.weight += weight
def eval(self):
if self.weight == 0:
raise ValueError(
"There is no data in DetectionMAP Metrics. "
"Please check layers.detection_map output has added to DetectionMAP."
)
return self.value / self.weight
class Auc(MetricBase):
"""
    Auc metric, which adapts to binary classification.
    Note that this metric computes the value natively in Python.
    If you are concerned about speed, please use fluid.layers.auc instead.
    The `auc` function creates four local variables, `true_positives`,
    `true_negatives`, `false_positives` and `false_negatives`, which are used
    to compute the AUC. To discretize the AUC curve, a linearly spaced set of
    thresholds is used to compute pairs of recall and precision values. The
    area under the ROC-curve is therefore computed using the height of the
    recall values by the false positive rate, while the area under the
    PR-curve is computed using the height of the precision values by the
    recall.
Args:
name: metric name
curve: Specifies the name of the curve to be computed, 'ROC' [default] or
'PR' for the Precision-Recall-curve.
num_thresholds: The number of thresholds to use when discretizing the roc
curve.
"NOTE: only implement the ROC curve type via Python now."
"""
def __init__(self, name, curve='ROC', num_thresholds=200):
        super(Auc, self).__init__(name=name)
        self._curve = curve
        self._num_thresholds = num_thresholds
        self._epsilon = 1e-6
        self.tp_list = np.zeros((num_thresholds, ))
        self.fn_list = np.zeros((num_thresholds, ))
        self.tn_list = np.zeros((num_thresholds, ))
        self.fp_list = np.zeros((num_thresholds, ))
def update(self, labels, predictions, axis=1):
if not _is_numpy_(labels):
raise ValueError("The 'labels' must be a numpy ndarray.")
if not _is_numpy_(predictions):
raise ValueError("The 'predictions' must be a numpy ndarray.")
kepsilon = 1e-7 # to account for floating point imprecisions
        thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1)
                      for i in range(self._num_thresholds - 2)]
thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
        # calculate the TP, FN, TN, FP counts
for idx_thresh, thresh in enumerate(thresholds):
tp, fn, tn, fp = 0, 0, 0, 0
for i, lbl in enumerate(labels):
if lbl:
if predictions[i, 0] >= thresh:
tp += 1
else:
fn += 1
else:
if predictions[i, 0] >= thresh:
fp += 1
else:
tn += 1
            self.tp_list[idx_thresh] += tp
            self.fn_list[idx_thresh] += fn
            self.tn_list[idx_thresh] += tn
            self.fp_list[idx_thresh] += fp
def eval(self):
epsilon = self._epsilon
num_thresholds = self._num_thresholds
        tpr = (self.tp_list.astype("float32") + epsilon) / (
            self.tp_list + self.fn_list + epsilon)
        fpr = self.fp_list.astype("float32") / (
            self.fp_list + self.tn_list + epsilon)
        rec = (self.tp_list.astype("float32") + epsilon) / (
            self.tp_list + self.fp_list + epsilon)
x = fpr[:num_thresholds - 1] - fpr[1:]
y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
auc_value = np.sum(x * y)
return auc_value
...@@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor'] ...@@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor']
class ParallelExecutor(object): class ParallelExecutor(object):
def __init__(self, def __init__(self,
loss_name,
use_cuda, use_cuda,
loss_name=None,
main_program=None,
num_threads=None, num_threads=None,
allow_op_delay=False): allow_op_delay=False,
share_vars_from=None):
"""
ParallelExecutor can run program in parallel.
Args:
use_cuda(bool): Whether to use CUDA or not.
            loss_name(str, default None): The loss name, which must be set during
                training.
            main_program(Program, default None): The program that needs to run;
                if not provided, default_main_program will be used.
            num_threads(int, default None): How many threads are used for
                training.
            allow_op_delay(bool, default False): Whether to delay and buffer
                some operators together for scheduling or not, which may
                improve performance in some cases, default False.
            share_vars_from(ParallelExecutor, default None): If provided,
                it will share variables from the specified ParallelExecutor.
Returns:
A ParallelExecutor object.
Raises:
            TypeError: If share_vars_from is provided, but is not a
                ParallelExecutor object.
Examples:
.. code-block:: python
train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
"""
self._places = [] self._places = []
self._act_places = [] self._act_places = []
if use_cuda: if use_cuda:
...@@ -48,12 +87,26 @@ class ParallelExecutor(object): ...@@ -48,12 +87,26 @@ class ParallelExecutor(object):
            # performance. Worth tuning for other models in the future.             # performance. Worth tuning for other models in the future.
num_threads = len(self._places) num_threads = len(self._places)
else: else:
min(len(self._places) * 2, multiprocessing.cpu_count()) num_threads = min(
len(self._places) * 2, multiprocessing.cpu_count())
startup = framework.default_startup_program() main = main_program
main = framework.default_main_program() main = main if main else framework.default_main_program()
scope = executor.global_scope() scope = executor.global_scope()
if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes(
) if share_vars_from else []
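        # Collect the names of all persistable (non-RAW) variables; they are
        # kept in the shared scope and can be broadcast across devices via
        # bcast_params().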
self.persistable_vars = [
v.name
for v in filter(lambda var: \
var.persistable and var.type != core.VarDesc.VarType.RAW,
main.list_vars())
]
self.executor = core.ParallelExecutor( self.executor = core.ParallelExecutor(
num_threads, num_threads,
True if use_cuda else False, # use_event True if use_cuda else False, # use_event
...@@ -62,10 +115,11 @@ class ParallelExecutor(object): ...@@ -62,10 +115,11 @@ class ParallelExecutor(object):
p.name for p in main.global_block().iter_parameters() p.name for p in main.global_block().iter_parameters()
if not p.stop_gradient if not p.stop_gradient
]), ]),
startup.desc, set(self.persistable_vars),
main.desc, main.desc,
loss_name, loss_name if loss_name else '',
scope, scope,
local_scopes,
allow_op_delay) allow_op_delay)
self.scope = scope self.scope = scope
...@@ -91,3 +145,6 @@ class ParallelExecutor(object): ...@@ -91,3 +145,6 @@ class ParallelExecutor(object):
self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict) self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
return [arr[i] for i in range(len(arr))] return [arr[i] for i in range(len(arr))]
def bcast_params(self):
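        """
        Broadcast the persistable variables so that every device holds the
        same parameter values.
        """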
self.executor.bcast_params(set(self.persistable_vars))
...@@ -22,221 +22,504 @@ from scipy.special import expit ...@@ -22,221 +22,504 @@ from scipy.special import expit
class TestExp(OpTest): class TestExp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "exp" self.op_type = "exp"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': np.exp(self.inputs['X'])} x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.exp(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
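# Each FP16 case below reuses its FP32 counterpart via init_dtype() and only
# checks the forward output on a CUDA device that supports float16, with a
# looser tolerance; gradient checks are skipped for float16 in test_check_grad.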
class TestFP16Exp(TestExp):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSigmoid(OpTest): class TestSigmoid(OpTest):
def setUp(self): def setUp(self):
self.op_type = "sigmoid" self.op_type = "sigmoid"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': 1 / (1 + np.exp(-self.inputs['X']))} x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
out = 1 / (1 + np.exp(-x))
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['X'], 'Out', max_relative_error=0.008) if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.01)
def init_dtype(self):
pass
class TestFP16Sigmoid(TestSigmoid):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestLogSigmoid(OpTest): class TestLogSigmoid(OpTest):
def setUp(self): def setUp(self):
self.op_type = "logsigmoid" self.op_type = "logsigmoid"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(-1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': np.log(1 / (1 + np.exp(-self.inputs['X'])))} x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
out = np.log(1 / (1 + np.exp(-x)))
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.008) self.check_grad(['X'], 'Out', max_relative_error=0.008)
def init_dtype(self):
pass
class TestFP16LogSigmoid(TestLogSigmoid):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestTanh(OpTest): class TestTanh(OpTest):
def setUp(self): def setUp(self):
self.op_type = "tanh" self.op_type = "tanh"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': np.tanh(self.inputs['X'])} x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.tanh(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Tanh(TestTanh):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestTanhShrink(OpTest): class TestTanhShrink(OpTest):
def setUp(self): def setUp(self):
self.op_type = "tanh_shrink" self.op_type = "tanh_shrink"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [10, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': self.inputs['X'] - np.tanh(self.inputs['X'])} x = np.random.uniform(0.1, 1, [10, 17]).astype(self.dtype)
out = x - np.tanh(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.008) self.check_grad(['X'], 'Out', max_relative_error=0.008)
def init_dtype(self):
pass
class TestFP16TanhShrink(TestTanhShrink):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestHardShrink(OpTest): class TestHardShrink(OpTest):
def setUp(self): def setUp(self):
self.op_type = "hard_shrink" self.op_type = "hard_shrink"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.init_dtype()
threshold = 0.5 threshold = 0.5
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.copy(x)
out[(out >= -threshold) & (out <= threshold)] = 0
self.inputs = {'X': x}
self.attrs = {'lambda': threshold} self.attrs = {'lambda': threshold}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
t = np.copy(x) self.outputs = {'Out': out}
t[(t >= -threshold) & (t <= threshold)] = 0
self.outputs = {'Out': t}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.005) self.check_grad(['X'], 'Out', max_relative_error=0.005)
def init_dtype(self):
pass
class TestFP16HardShrink(TestHardShrink):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSoftShrink(OpTest): class TestSoftShrink(OpTest):
def setUp(self): def setUp(self):
self.op_type = "softshrink" self.op_type = "softshrink"
self.dtype = np.float32
self.init_dtype()
lambda_val = 0.1 lambda_val = 0.1
x = np.random.uniform(0.25, 10, [4, 4]).astype(self.dtype)
out = np.copy(x)
out = (out < -lambda_val) * (out + lambda_val) + (out > lambda_val) * (
out - lambda_val)
self.attrs = {'lambda': lambda_val} self.attrs = {'lambda': lambda_val}
self.inputs = { self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
'X': np.random.uniform(0.25, 10, [4, 4]).astype("float32") self.outputs = {'Out': out}
}
y = np.copy(self.inputs['X'])
y = (y < -lambda_val) * (y + lambda_val) + (y > lambda_val) * (
y - lambda_val)
self.outputs = {'Out': y}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16SoftShrink(TestSoftShrink):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSqrt(OpTest): class TestSqrt(OpTest):
def setUp(self): def setUp(self):
self.op_type = "sqrt" self.op_type = "sqrt"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': np.sqrt(self.inputs['X'])} x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.sqrt(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Sqrt(TestSqrt):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestAbs(OpTest): class TestAbs(OpTest):
def setUp(self): def setUp(self):
self.op_type = "abs" self.op_type = "abs"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
        # Because we set delta = 0.005 in calculating numeric gradient,         # Because we set delta = 0.005 in calculating numeric gradient,
# if x is too small, such as 0.002, x_neg will be -0.003 # if x is too small, such as 0.002, x_neg will be -0.003
        # x_pos will be 0.007, so the numeric gradient is inaccurate.         # x_pos will be 0.007, so the numeric gradient is inaccurate.
# we should avoid this # we should avoid this
x[np.abs(x) < 0.005] = 0.02 x[np.abs(x) < 0.005] = 0.02
self.inputs = {'X': x} out = np.abs(x)
self.outputs = {'Out': np.abs(self.inputs['X'])}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Abs(TestAbs):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCeil(OpTest): class TestCeil(OpTest):
def setUp(self): def setUp(self):
self.op_type = "ceil" self.op_type = "ceil"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.inputs = {'X': x} self.init_dtype()
self.outputs = {'Out': np.ceil(self.inputs['X'])}
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.ceil(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Ceil(TestCeil):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestFloor(OpTest): class TestFloor(OpTest):
def setUp(self): def setUp(self):
self.op_type = "floor" self.op_type = "floor"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.inputs = {'X': x} self.init_dtype()
self.outputs = {'Out': np.floor(self.inputs['X'])}
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.floor(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Floor(TestFloor):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCos(OpTest): class TestCos(OpTest):
def setUp(self): def setUp(self):
self.op_type = "cos" self.op_type = "cos"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.inputs = {'X': x} self.init_dtype()
self.outputs = {'Out': np.cos(self.inputs['X'])}
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.cos(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Cos(TestCos):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSin(OpTest): class TestSin(OpTest):
def setUp(self): def setUp(self):
self.op_type = "sin" self.op_type = "sin"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.inputs = {'X': x} self.init_dtype()
self.outputs = {'Out': np.sin(self.inputs['X'])}
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.sin(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Sin(TestSin):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestRound(OpTest): class TestRound(OpTest):
def setUp(self): def setUp(self):
self.op_type = "round" self.op_type = "round"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.inputs = {'X': x} self.init_dtype()
self.outputs = {'Out': np.round(self.inputs['X'])}
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.round(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Round(TestRound):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestRelu(OpTest): class TestRelu(OpTest):
def setUp(self): def setUp(self):
...@@ -278,222 +561,463 @@ class TestFP16Relu(TestRelu): ...@@ -278,222 +561,463 @@ class TestFP16Relu(TestRelu):
class TestBRelu(OpTest): class TestBRelu(OpTest):
def setUp(self): def setUp(self):
self.op_type = "brelu" self.op_type = "brelu"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
t_min = 1.0 t_min = 1.0
t_max = 4.0 t_max = 4.0
# The same with TestAbs # The same with TestAbs
x[np.abs(x - t_min) < 0.005] = t_min + 0.02 x[np.abs(x - t_min) < 0.005] = t_min + 0.02
x[np.abs(x - t_max) < 0.005] = t_max + 0.02 x[np.abs(x - t_max) < 0.005] = t_max + 0.02
self.inputs = {'X': x}
self.attrs = {'t_min': t_min, 't_max': t_max}
t = np.copy(x) t = np.copy(x)
t[t < t_min] = t_min t[t < t_min] = t_min
t[t > t_max] = t_max t[t > t_max] = t_max
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'t_min': t_min, 't_max': t_max}
self.outputs = {'Out': t} self.outputs = {'Out': t}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.02) self.check_grad(['X'], 'Out', max_relative_error=0.02)
def init_dtype(self):
pass
class TestFP16BRelu(TestBRelu):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestRelu6(OpTest): class TestRelu6(OpTest):
def setUp(self): def setUp(self):
self.op_type = "relu6" self.op_type = "relu6"
x = np.random.uniform(-1, 1, [4, 10]).astype("float32") self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 10]).astype(self.dtype)
threshold = 6.0 threshold = 6.0
# The same with TestAbs # The same with TestAbs
x[np.abs(x) < 0.005] = 0.02 x[np.abs(x) < 0.005] = 0.02
x[np.abs(x - threshold) < 0.005] = threshold + 0.02 x[np.abs(x - threshold) < 0.005] = threshold + 0.02
out = np.minimum(np.maximum(x, 0), threshold)
self.inputs = {'X': x} self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'threshold': threshold} self.attrs = {'threshold': threshold}
self.outputs = { self.outputs = {'Out': out}
'Out': np.minimum(np.maximum(self.inputs['X'], 0), threshold)
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.02) self.check_grad(['X'], 'Out', max_relative_error=0.02)
def init_dtype(self):
pass
class TestFP16Relu6(TestRelu6):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSoftRelu(OpTest): class TestSoftRelu(OpTest):
def setUp(self): def setUp(self):
self.op_type = "soft_relu" self.op_type = "soft_relu"
x = np.random.uniform(-3, 3, [4, 4]).astype("float32") self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype)
threshold = 2.0 threshold = 2.0
# The same reason with TestAbs # The same reason with TestAbs
x[np.abs(x - threshold) < 0.005] = threshold + 0.02 x[np.abs(x - threshold) < 0.005] = threshold + 0.02
x[np.abs(x + threshold) < 0.005] = -threshold + 0.02 x[np.abs(x + threshold) < 0.005] = -threshold + 0.02
self.inputs = {'X': x}
self.attrs = {'threshold': threshold}
t = np.copy(x) t = np.copy(x)
t[t < -threshold] = -threshold t[t < -threshold] = -threshold
t[t > threshold] = threshold t[t > threshold] = threshold
self.outputs = {'Out': np.log((np.exp(t) + 1))} out = np.log((np.exp(t) + 1))
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'threshold': threshold}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.02) self.check_grad(['X'], 'Out', max_relative_error=0.02)
def init_dtype(self):
pass
class TestFP16SoftRelu(TestSoftRelu):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestELU(OpTest): class TestELU(OpTest):
def setUp(self): def setUp(self):
self.op_type = "elu" self.op_type = "elu"
x = np.random.uniform(-3, 3, [4, 4]).astype("float32") self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype)
alpha = 1. alpha = 1.
out = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
# Note: unlike other Relu extensions, point 0 on standard ELU function (i.e. alpha = 1) # Note: unlike other Relu extensions, point 0 on standard ELU function (i.e. alpha = 1)
# is differentiable, so we can skip modifications like x[np.abs(x) < 0.005] = 0.02 here # is differentiable, so we can skip modifications like x[np.abs(x) < 0.005] = 0.02 here
self.inputs = {'X': x} self.inputs = {'X': x}
self.attrs = {'alpha': alpha} self.attrs = {'alpha': alpha}
self.outputs = { self.outputs = {'Out': out}
'Out': np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.02) self.check_grad(['X'], 'Out', max_relative_error=0.02)
def init_dtype(self):
pass
class TestFP16ELU(TestELU):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestReciprocal(OpTest): class TestReciprocal(OpTest):
def setUp(self): def setUp(self):
self.op_type = "reciprocal" self.op_type = "reciprocal"
self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")} self.dtype = np.float32
self.outputs = {'Out': np.reciprocal(self.inputs['X'])} self.init_dtype()
x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
out = np.reciprocal(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.01) self.check_grad(['X'], 'Out', max_relative_error=0.01)
def init_dtype(self):
pass
class TestFP16Reciprocal(TestReciprocal):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestLog(OpTest): class TestLog(OpTest):
def setUp(self): def setUp(self):
self.op_type = "log" self.op_type = "log"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': np.log(self.inputs['X'])} x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.log(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Log(TestLog):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSquare(OpTest): class TestSquare(OpTest):
def setUp(self): def setUp(self):
self.op_type = "square" self.op_type = "square"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': np.square(self.inputs['X'])} x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.square(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Square(TestSquare):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestPow(OpTest): class TestPow(OpTest):
def setUp(self): def setUp(self):
self.op_type = "pow" self.op_type = "pow"
self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")} self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
out = np.power(x, 3)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'factor': 3.0} self.attrs = {'factor': 3.0}
self.outputs = {'Out': np.power(self.inputs['X'], 3)} self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.02) self.check_grad(['X'], 'Out', max_relative_error=0.02)
def init_dtype(self):
pass
class TestFP16Pow(TestPow):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=5e-2)
class TestSTanh(OpTest): class TestSTanh(OpTest):
def setUp(self): def setUp(self):
self.op_type = "stanh" self.op_type = "stanh"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
scale_a = 2.0 / 3.0 scale_a = 2.0 / 3.0
scale_b = 1.7159 scale_b = 1.7159
out = scale_b * np.tanh(x * scale_a)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'scale_a': scale_a, 'scale_b': scale_b} self.attrs = {'scale_a': scale_a, 'scale_b': scale_b}
self.outputs = {'Out': scale_b * np.tanh(self.inputs['X'] * scale_a)} self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16STanh(TestSTanh):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSoftplus(OpTest): class TestSoftplus(OpTest):
def setUp(self): def setUp(self):
self.op_type = "softplus" self.op_type = "softplus"
self.inputs = { self.dtype = np.float64
'X': np.random.uniform(-1, 1, [11, 17]).astype("float64") self.init_dtype()
}
self.outputs = {'Out': np.log(1 + np.exp(self.inputs['X']))} x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
out = np.log(1 + np.exp(x))
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Softplus(TestSoftplus):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSoftsign(OpTest): class TestSoftsign(OpTest):
def setUp(self): def setUp(self):
self.op_type = "softsign" self.op_type = "softsign"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(-1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = { x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
'Out': np.divide(self.inputs['X'], 1 + np.abs(self.inputs['X'])) out = np.divide(x, 1 + np.abs(x))
}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Softsign(TestSoftsign):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestThresholdedRelu(OpTest): class TestThresholdedRelu(OpTest):
def setUp(self): def setUp(self):
self.op_type = "thresholded_relu" self.op_type = "thresholded_relu"
self.dtype = np.float32
self.init_dtype()
threshold = 0.25 threshold = 0.25
self.relative_error = 0.005 self.relative_error = 0.005
X = np.random.uniform(-1, 1, [11, 17]).astype("float32") X = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
# Same reason as TestAbs # Same reason as TestAbs
X[np.abs(X - threshold) < self.relative_error] = threshold + 0.2 X[np.abs(X - threshold) < self.relative_error] = threshold + 0.2
out = (X > threshold) * X
self.inputs = {'X': X} self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
self.attrs = {'threshold': threshold} self.attrs = {'threshold': threshold}
self.outputs = {'Out': (X > threshold) * X} self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=self.relative_error) self.check_grad(['X'], 'Out', max_relative_error=self.relative_error)
def init_dtype(self):
pass
class TestFP16ThresholdedRelu(TestThresholdedRelu):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestHardSigmoid(OpTest): class TestHardSigmoid(OpTest):
def setUp(self): def setUp(self):
self.op_type = "hard_sigmoid" self.op_type = "hard_sigmoid"
self.dtype = np.float32
self.init_dtype()
self.relative_error = 0.002 self.relative_error = 0.002
X = np.random.uniform(-5, 5, [2, 2]).astype("float32") X = np.random.uniform(-5, 5, [2, 2]).astype("float32")
...@@ -502,7 +1026,6 @@ class TestHardSigmoid(OpTest): ...@@ -502,7 +1026,6 @@ class TestHardSigmoid(OpTest):
lower_threshold = -offset / slope lower_threshold = -offset / slope
upper_threshold = (1 - offset) / slope upper_threshold = (1 - offset) / slope
self.inputs = {'X': X}
# Same reason as TestAbs # Same reason as TestAbs
X[np.abs(X - lower_threshold) < self.relative_error] = \ X[np.abs(X - lower_threshold) < self.relative_error] = \
lower_threshold + 0.2 lower_threshold + 0.2
...@@ -510,34 +1033,103 @@ class TestHardSigmoid(OpTest): ...@@ -510,34 +1033,103 @@ class TestHardSigmoid(OpTest):
upper_threshold - 0.2 upper_threshold - 0.2
temp = X * slope + offset temp = X * slope + offset
self.outputs = {'Out': np.maximum(0.0, np.minimum(1.0, temp))} out = np.maximum(0.0, np.minimum(1.0, temp))
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.002) self.check_grad(['X'], 'Out', max_relative_error=0.002)
def init_dtype(self):
pass
class TestFP16HardSigmoid(TestHardSigmoid):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSwish(OpTest): class TestSwish(OpTest):
def setUp(self): def setUp(self):
self.op_type = "swish" self.op_type = "swish"
X = np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.dtype = np.float32
self.inputs = {'X': X} self.init_dtype()
self.attrs = {'beta': 2.3}
self.outputs = {'Out': X * expit(self.attrs['beta'] * X)} X = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
beta = 2.3
out = X * expit(beta * X)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
self.attrs = {'beta': beta}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.008) self.check_grad(['X'], 'Out', max_relative_error=0.008)
def init_dtype(self):
pass
class TestFP16Swish(TestSwish):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
#--------------------test MKLDNN-------------------- #--------------------test MKLDNN--------------------
class TestMKLDNNRelu(TestRelu): class TestMKLDNNReluDim2(TestRelu):
def setUp(self):
super(TestMKLDNNReluDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanhDim2(TestTanh):
def setUp(self):
super(TestMKLDNNTanhDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrtDim2(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrtDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbsDim2(TestAbs):
def setUp(self):
super(TestMKLDNNAbsDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNReluDim4(TestRelu):
def setUp(self): def setUp(self):
super(TestMKLDNNRelu, self).setUp() super(TestMKLDNNReluDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs # The same reason with TestAbs
...@@ -549,9 +1141,9 @@ class TestMKLDNNRelu(TestRelu): ...@@ -549,9 +1141,9 @@ class TestMKLDNNRelu(TestRelu):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanh(TestTanh): class TestMKLDNNTanhDim4(TestTanh):
def setUp(self): def setUp(self):
super(TestMKLDNNTanh, self).setUp() super(TestMKLDNNTanhDim4, self).setUp()
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
...@@ -560,9 +1152,9 @@ class TestMKLDNNTanh(TestTanh): ...@@ -560,9 +1152,9 @@ class TestMKLDNNTanh(TestTanh):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrt(TestSqrt): class TestMKLDNNSqrtDim4(TestSqrt):
def setUp(self): def setUp(self):
super(TestMKLDNNSqrt, self).setUp() super(TestMKLDNNSqrtDim4, self).setUp()
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
...@@ -571,9 +1163,9 @@ class TestMKLDNNSqrt(TestSqrt): ...@@ -571,9 +1163,9 @@ class TestMKLDNNSqrt(TestSqrt):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbs(TestAbs): class TestMKLDNNAbsDim4(TestAbs):
def setUp(self): def setUp(self):
super(TestMKLDNNAbs, self).setUp() super(TestMKLDNNAbsDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs # The same reason with TestAbs
......
...@@ -51,7 +51,9 @@ class TestDebugger(unittest.TestCase): ...@@ -51,7 +51,9 @@ class TestDebugger(unittest.TestCase):
outputs={"Out": mul_out}, outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1}) attrs={"x_num_col_dims": 1})
print(debuger.pprint_program_codes(p.desc)) print(debuger.pprint_program_codes(p))
debuger.draw_block_graphviz(p.block(0), path="./test.dot")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -61,8 +61,12 @@ class TestMultipleReader(unittest.TestCase): ...@@ -61,8 +61,12 @@ class TestMultipleReader(unittest.TestCase):
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
batch_count = 0 batch_count = 0
while not data_files.eof(): while True:
img_val, = exe.run(fetch_list=[img]) try:
img_val, = exe.run(fetch_list=[img])
except fluid.core.EnforceNotMet as ex:
self.assertIn("There is no next data.", ex.message)
break
batch_count += 1 batch_count += 1
self.assertLessEqual(img_val.shape[0], self.batch_size) self.assertLessEqual(img_val.shape[0], self.batch_size)
data_files.reset() data_files.reset()
......
...@@ -44,7 +44,7 @@ class TestMultipleReader(unittest.TestCase): ...@@ -44,7 +44,7 @@ class TestMultipleReader(unittest.TestCase):
shapes=[(-1, 784), (-1, 1)], shapes=[(-1, 784), (-1, 1)],
lod_levels=[0, 0], lod_levels=[0, 0],
dtypes=['float32', 'int64']) dtypes=['float32', 'int64'])
data_file = fluid.layers.create_multi_pass_reader( data_file = fluid.layers.io.multi_pass(
reader=data_file, pass_num=self.pass_num) reader=data_file, pass_num=self.pass_num)
img, label = fluid.layers.read_file(data_file) img, label = fluid.layers.read_file(data_file)
...@@ -57,8 +57,12 @@ class TestMultipleReader(unittest.TestCase): ...@@ -57,8 +57,12 @@ class TestMultipleReader(unittest.TestCase):
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
batch_count = 0 batch_count = 0
while not data_file.eof(): while True:
img_val, = exe.run(fetch_list=[img]) try:
img_val, = exe.run(fetch_list=[img])
except fluid.core.EnforceNotMet as ex:
self.assertIn("There is no next data.", ex.message)
break
batch_count += 1 batch_count += 1
self.assertLessEqual(img_val.shape[0], self.batch_size) self.assertLessEqual(img_val.shape[0], self.batch_size)
data_file.reset() data_file.reset()
......
...@@ -26,11 +26,14 @@ def simple_fc_net(use_feed): ...@@ -26,11 +26,14 @@ def simple_fc_net(use_feed):
img = fluid.layers.data(name='image', shape=[784], dtype='float32') img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else: else:
reader = fluid.layers.open_recordio_file( reader = fluid.layers.open_files(
filename='./mnist.recordio', filenames=['./mnist.recordio'],
shapes=[[-1, 784], [-1, 1]], shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0], lod_levels=[0, 0],
dtypes=['float32', 'int64']) dtypes=['float32', 'int64'],
thread_num=1,
for_parallel=True)
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader) img, label = fluid.layers.read_file(reader)
hidden = img hidden = img
for _ in xrange(4): for _ in xrange(4):
...@@ -51,11 +54,14 @@ def fc_with_batchnorm(use_feed): ...@@ -51,11 +54,14 @@ def fc_with_batchnorm(use_feed):
img = fluid.layers.data(name='image', shape=[784], dtype='float32') img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else: else:
reader = fluid.layers.open_recordio_file( reader = fluid.layers.open_files(
filename='./mnist.recordio', filenames=['mnist.recordio'],
shapes=[[-1, 784], [-1, 1]], shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0], lod_levels=[0, 0],
dtypes=['float32', 'int64']) dtypes=['float32', 'int64'],
thread_num=1,
for_parallel=True)
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader) img, label = fluid.layers.read_file(reader)
hidden = img hidden = img
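Both network builders now read ./mnist.recordio through open_files with a single I/O thread and for_parallel=True, then wrap the reader in a double buffer before read_file. A condensed sketch of that input pipeline, lifted from the hunks above:

import paddle.fluid as fluid

reader = fluid.layers.open_files(
    filenames=['./mnist.recordio'],
    shapes=[[-1, 784], [-1, 1]],
    lod_levels=[0, 0],
    dtypes=['float32', 'int64'],
    thread_num=1,
    for_parallel=True)
# Prefetch the next batch while the current one is being consumed.
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)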
...@@ -207,7 +213,11 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -207,7 +213,11 @@ class TestParallelExecutorBase(unittest.TestCase):
if memory_opt: if memory_opt:
fluid.memory_optimize(main) fluid.memory_optimize(main)
exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True) place = fluid.CUDAPlace(0)
startup_exe = fluid.Executor(place)
startup_exe.run(startup)
exe = fluid.ParallelExecutor(True, loss_name=loss.name)
if batch_size is not None: if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count() batch_size *= fluid.core.get_cuda_device_count()
begin = time.time() begin = time.time()
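The test harness now initializes parameters with an ordinary Executor on CUDAPlace(0) before building the ParallelExecutor, which receives use_cuda positionally and only the loss name. A sketch of the updated sequence, assuming main, startup and loss are constructed as above:

import paddle.fluid as fluid

place = fluid.CUDAPlace(0)
# Run the startup program once with a plain Executor to initialize parameters.
startup_exe = fluid.Executor(place)
startup_exe.run(startup)
# The ParallelExecutor then only executes the main (training) program.
exe = fluid.ParallelExecutor(True, loss_name=loss.name)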
...@@ -453,3 +463,41 @@ class TestTransformer(TestParallelExecutorBase): ...@@ -453,3 +463,41 @@ class TestTransformer(TestParallelExecutorBase):
@unittest.skip("transformer is buggy in multi gpu") @unittest.skip("transformer is buggy in multi gpu")
def test_main(self): def test_main(self):
self.check_network_convergence(transformer) self.check_network_convergence(transformer)
class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def test_parallel_testing(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = simple_fc_net(True)
test_program = main.clone(for_test=True)
opt = fluid.optimizer.SGD(learning_rate=0.0001)
opt.minimize(loss)
batch_size = 32
image = numpy.random.normal(size=(batch_size,
784)).astype('float32')
label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64")
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
feed_dict = {'image': image, 'label': label}
train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name, main_program=main)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
for i in xrange(5):
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
test_loss = numpy.array(test_loss)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
train_loss = numpy.array(train_loss)
self.assertTrue(numpy.allclose(train_loss, test_loss))
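The new test exercises evaluation during training: the main program is cloned with for_test=True before the optimizer is attached, a training ParallelExecutor is built over main, and a second ParallelExecutor is built over the cloned test program with share_vars_from=train_exe so both executors read the same parameters. Feeding the identical random batch to both should then yield matching losses, which is what the numpy.allclose assertion verifies.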
...@@ -201,24 +201,6 @@ class TestBlockDesc(unittest.TestCase): ...@@ -201,24 +201,6 @@ class TestBlockDesc(unittest.TestCase):
op1.set_type("test") op1.set_type("test")
op2.set_type("test") op2.set_type("test")
var0 = block.var("var0")
var1 = block.var("var1")
var2 = block.var("var2")
var3 = block.var("var3")
var4 = block.var("var4")
var5 = block.var("var5")
op0.set_input("X", ["var0"])
op0.set_output("Y", ["var0"])
op1.set_input("X", ["var1", "var2"])
op1.set_output("Y", ["var3", "var4"])
op2.set_input("X", ["var1"])
op2.set_output("Y", ["var4", "var5"])
program.sync_with_cpp()
# remove op1, its input var2 and output var3 will be removed at the same time,
# but its input var1 and output var4 will not be removed since they are used for op2.
block.remove_op(1, 2) block.remove_op(1, 2)
program.sync_with_cpp() program.sync_with_cpp()
...@@ -226,8 +208,6 @@ class TestBlockDesc(unittest.TestCase): ...@@ -226,8 +208,6 @@ class TestBlockDesc(unittest.TestCase):
for idx in xrange(0, block.op_size()): for idx in xrange(0, block.op_size()):
all_ops.append(block.op(idx)) all_ops.append(block.op(idx))
self.assertEqual(all_ops, [op0, op2]) self.assertEqual(all_ops, [op0, op2])
all_vars = block.all_vars()
self.assertEqual(set(all_vars), {var0, var1, var4, var5})
if __name__ == '__main__': if __name__ == '__main__':
......
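With the variable bookkeeping removed, the test now only asserts that remove_op(1, 2) deletes the ops in the half-open range [1, 2) and that op0 and op2 survive; it no longer claims anything about which variables are removed alongside them. A minimal sketch of the remaining check, assuming program, block and the three ops are created as in the test above:

# Remove the ops in [1, 2) and re-sync the Python view with the C++ desc.
block.remove_op(1, 2)
program.sync_with_cpp()

all_ops = []
for idx in xrange(0, block.op_size()):
    all_ops.append(block.op(idx))
assert all_ops == [op0, op2]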
...@@ -65,8 +65,13 @@ class TestRecordIO(unittest.TestCase): ...@@ -65,8 +65,13 @@ class TestRecordIO(unittest.TestCase):
# train a pass # train a pass
batch_id = 0 batch_id = 0
while not data_file.eof(): while True:
tmp, = exe.run(fetch_list=[avg_loss]) try:
tmp, = exe.run(fetch_list=[avg_loss])
except fluid.core.EnforceNotMet as ex:
self.assertIn("There is no next data.", ex.message)
break
avg_loss_np.append(tmp) avg_loss_np.append(tmp)
batch_id += 1 batch_id += 1
data_file.reset() data_file.reset()
...@@ -74,8 +79,8 @@ class TestRecordIO(unittest.TestCase): ...@@ -74,8 +79,8 @@ class TestRecordIO(unittest.TestCase):
self.assertLess(avg_loss_np[-1], avg_loss_np[0]) self.assertLess(avg_loss_np[-1], avg_loss_np[0])
def test_shuffle_reader(self): def test_shuffle_reader(self):
self.test_main(decorator_callback=lambda reader: fluid.layers.create_shuffle_reader(reader, buffer_size=200)) self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(reader, buffer_size=200))
def test_double_buffer_reader(self): def test_double_buffer_reader(self):
self.test_main(decorator_callback=lambda reader: fluid.layers.create_double_buffer_reader(reader, self.test_main(decorator_callback=lambda reader: fluid.layers.io.double_buffer(reader,
place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu')) place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))
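The reader decorators move under fluid.layers.io: create_shuffle_reader becomes io.shuffle and create_double_buffer_reader becomes io.double_buffer. A minimal sketch of chaining both onto a file reader, assuming data_file is the reader opened earlier in this test:

import paddle.fluid as fluid

# Shuffle within a 200-sample buffer, then double-buffer onto the target device.
data_file = fluid.layers.io.shuffle(data_file, buffer_size=200)
data_file = fluid.layers.io.double_buffer(
    data_file, place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu')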
...@@ -102,7 +102,7 @@ if '${WITH_FLUID_ONLY}'== 'OFF': ...@@ -102,7 +102,7 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
package_data['py_paddle']=['*.py','_swig_paddle.so'] package_data['py_paddle']=['*.py','_swig_paddle.so']
package_dir={ package_dir={
'': '${CMAKE_CURRENT_SOURCE_DIR}', '': '${PADDLE_BINARY_DIR}/python',
# The paddle.fluid.proto will be generated while compiling. # The paddle.fluid.proto will be generated while compiling.
# So that package points to other directory. # So that package points to other directory.
'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform', 'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
......