Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into modify_dev

75997999 · JiayiFeng · 5416bac5 · 44c346be · 75997999 · 75997999
51 changed file
--- a/benchmark/fluid/machine_translation.py
+++ b/benchmark/fluid/machine_translation.py
@@ -48,6 +48,13 @@ parser.add_argument(
    type=int,
    default=16,
    help="The sequence number of a mini-batch data. (default: %(default)d)")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The first num of minibatch num to skip, for better performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
 parser.add_argument(
    "--dict_size",
    type=int,
@@ -72,16 +79,21 @@ parser.add_argument(
    default=3,
    help="The width for beam searching. (default: %(default)d)")
 parser.add_argument(
-    "--use_gpu",
-    type=distutils.util.strtobool,
-    default=True,
-    help="Whether to use gpu. (default: %(default)d)")
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
 parser.add_argument(
    "--max_length",
    type=int,
    default=250,
    help="The maximum length of sequence when doing generation. "
    "(default: %(default)d)")
+parser.add_argument(
+    '--with_test',
+    action='store_true',
+    help='If set, test the testset during training.')


 def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
@@ -281,7 +293,7 @@ def train():
            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
        batch_size=args.batch_size)

-    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = Executor(place)
    exe.run(framework.default_startup_program())

@@ -307,14 +319,20 @@ def train():

        return total_loss / count

+    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in xrange(args.pass_num):
-        pass_start_time = time.time()
-        words_seen = 0
+        train_accs = []
+        train_losses = []
        for batch_id, data in enumerate(train_batch_generator()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
            src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
-            words_seen += word_num
+            num_samples += word_num
            trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
-            words_seen += word_num
+            num_samples += word_num
            lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)

            fetch_outs = exe.run(framework.default_main_program(),
@@ -325,24 +343,36 @@ def train():
                                 },
                                 fetch_list=[avg_cost])

-            avg_cost_val = np.array(fetch_outs[0])
-            print('pass_id=%d, batch_id=%d, train_loss: %f' %
-                  (pass_id, batch_id, avg_cost_val))
+            iters += 1
+            loss = np.array(fetch_outs[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
+            )  # The accuracy is the accumulation of batches, but not the current batch.

-        pass_end_time = time.time()
-        test_loss = do_validation()
-        time_consumed = pass_end_time - pass_start_time
-        words_per_sec = words_seen / time_consumed
-        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
-              (pass_id, test_loss, words_per_sec, time_consumed))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            test_loss = do_validation()
+        exit(0)


 def infer():
    pass


+def print_arguments(args):
+    print('----------- seq2seq Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
 if __name__ == '__main__':
    args = parser.parse_args()
+    print_arguments(args)
    if args.infer_only:
        infer()
    else:

--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -35,6 +35,12 @@ def parse_args():
    parser = argparse.ArgumentParser("mnist model benchmark.")
    parser.add_argument(
        '--batch_size', type=int, default=128, help='The minibatch size.')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
    parser.add_argument(
        '--iterations', type=int, default=35, help='The number of minibatches.')
    parser.add_argument(
@@ -53,19 +59,14 @@ def parse_args():
        '--use_nvprof',
        action='store_true',
        help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
    args = parser.parse_args()
    return args


-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('-----------  Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
 def cnn_model(data):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=data,
@@ -161,16 +162,22 @@ def run_benchmark(model, args):
        paddle.dataset.mnist.train(), batch_size=args.batch_size)

    accuracy = fluid.average.WeightedAverage()
+    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        accuracy.reset()
-        pass_start = time.time()
+        train_accs = []
+        train_losses = []
        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
            img_data = np.array(
                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([len(y_data), 1])

-            start = time.time()
            outs = exe.run(
                fluid.default_main_program(),
                feed={"pixel": img_data,
@@ -178,21 +185,36 @@ def run_benchmark(model, args):
                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
            )  # The accuracy is the accumulation of batches, but not the current batch.
            accuracy.add(value=outs[1], weight=outs[2])
-            end = time.time()
+            iters += 1
+            num_samples += len(y_data)
            loss = np.array(outs[0])
            acc = np.array(outs[1])
-            print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
-                  (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
+            train_losses.append(loss)
+            train_accs.append(acc)
+            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
+                  (pass_id, iters, loss, acc))
+
+        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed

-        pass_end = time.time()
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+                                     inference_program)
+        exit(0)

-        train_avg_acc = accuracy.eval()
-        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
-                                 inference_program)

-        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
-              (pass_id, train_avg_acc, test_avg_acc,
-               (pass_end - pass_start) / 1000))
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- mnist Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':

--- a/benchmark/fluid/resnet.py
+++ b/benchmark/fluid/resnet.py
@@ -87,15 +87,6 @@ def parse_args():
    return args


-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('-----------  Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
 def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
    conv1 = fluid.layers.conv2d(
        input=input,
@@ -279,32 +270,31 @@ def run_benchmark(model, args):
                      'label': label},
                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
            iters += 1
-            num_samples += label[0]
+            num_samples += len(label)
            accuracy.add(value=acc, weight=weight)
            train_losses.append(loss)
            train_accs.append(acc)
            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
                  (pass_id, iters, loss, acc))
-        pass_train_acc = accuracy.eval()
-        # evaluation
-        if args.with_test:
-            pass_test_acc = test(exe)
-        train_elapsed = time.time() - start_time
        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
-
+        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
-
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            pass_test_acc = test(exe)
+        exit(0)

-    if args.use_cprof:
-        pr.disable()
-        s = StringIO.StringIO()
-        sortby = 'cumulative'
-        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
-        ps.print_stats()
-        print(s.getvalue())
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- resnet Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':

--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
 #!/bin/bash
 # This script benchmarking the PaddlePaddle Fluid on
 # single thread single GPU.
-export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
+
+#export FLAGS_fraction_of_gpu_memory_to_use=0.0
+export CUDNN_PATH=/paddle/cudnn_v5

 # disable openmp and mkl parallel
 #https://github.com/PaddlePaddle/Paddle/issues/7199
@@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0
 export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH

+# only query the gpu used
+nohup stdbuf -oL nvidia-smi \
+      --id=${CUDA_VISIBLE_DEVICES} \
+      --query-gpu=timestamp \
+      --query-compute-apps=pid,process_name,used_memory \
+      --format=csv \
+      --filename=mem.log  \
+      -l 1 &
+# mnist
+# mnist gpu mnist 128
+FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+               --device=GPU \
+               --batch_size=128 \
+               --skip_batch_num=5 \
+               --iterations=500 \
+               2>&1 | tee -a mnist_gpu_128.log

 # vgg16
-# cifar10 gpu cifar10 128
-FLAGS_benchmark=true python fluid/vgg.py \
+# gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
-               --iterations=30  \
-               2>&1 > vgg16_gpu_128.log
+               --iterations=30 \
+               2>&1 | tee -a vgg16_gpu_128.log
+
+# flowers gpu  128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+               --device=GPU \
+               --batch_size=32 \
+               --data_set=flowers \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a vgg16_gpu_flowers_32.log

 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true python fluid/resnet.py \
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
               --device=GPU \
               --batch_size=128 \
               --data_set=cifar10 \
               --model=resnet_cifar10 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 > resnet50_gpu_128.log
+               2>&1 | tee -a resnet50_gpu_128.log
+
+# resnet50 gpu flowers 64
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+               --device=GPU \
+               --batch_size=64 \
+               --data_set=flowers \
+               --model=resnet_imagenet \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a resnet50_gpu_flowers_64.log

 # lstm
+# lstm gpu imdb 32 # tensorflow only support batch=32
+FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+               --device=GPU \
+               --batch_size=32 \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               --hidden_dim=512 \
+               --emb_dim=512 \
+               --crop_size=1500 \
+               2>&1 | tee -a lstm_gpu_32.log
+
+# seq2seq
+# seq2seq gpu wmb 128
+FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+               --device=GPU \
+               --batch_size=128 \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a lstm_gpu_128.log
--- a/benchmark/fluid/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@@ -37,6 +37,14 @@ def parse_args():
        type=int,
        default=32,
        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
    parser.add_argument(
        '--emb_dim',
        type=int,
@@ -64,6 +72,10 @@ def parse_args():
        default=int(os.environ.get('CROP_SIZE', '1500')),
        help='The max sentence length of input. Since this model use plain RNN,'
        ' Gradient could be explored if sentence is too long')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
    args = parser.parse_args()
    return args

@@ -157,37 +169,43 @@ def main():
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

-    def train_loop(pass_num, crop_size):
-        with profiler.profiler(args.device, 'total') as prof:
-            for pass_id in range(pass_num):
-                train_reader = batch(
-                    paddle.reader.shuffle(
-                        crop_sentence(imdb.train(word_dict), crop_size),
-                        buf_size=25000),
-                    batch_size=args.batch_size)
-                word_nums = 0
-                pass_start_time = time.time()
-                for batch_id, data in enumerate(train_reader()):
-                    tensor_words = to_lodtensor([x[0] for x in data], place)
-                    for x in data:
-                        word_nums += len(x[0])
-                    label = numpy.array([x[1] for x in data]).astype("int64")
-                    label = label.reshape((-1, 1))
-                    loss_np, acc, weight = exe.run(
-                        fluid.default_main_program(),
-                        feed={"words": tensor_words,
-                              "label": label},
-                        fetch_list=[loss, batch_acc, batch_size_tensor])
-                    print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
-                          (pass_id, batch_id, loss_np, acc))
-
-                pass_end_time = time.time()
-                time_consumed = pass_end_time - pass_start_time
-                words_per_sec = word_nums / time_consumed
-                print("pass_id=%d, sec/pass: %f, words/s: %f" %
-                      (pass_id, time_consumed, words_per_sec))
-
-    train_loop(args.pass_num, args.crop_size)
+    train_reader = batch(
+        paddle.reader.shuffle(
+            crop_sentence(imdb.train(word_dict), args.crop_size),
+            buf_size=25000),
+        batch_size=args.batch_size)
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            tensor_words = to_lodtensor([x[0] for x in data], place)
+            label = numpy.array([x[1] for x in data]).astype("int64")
+            label = label.reshape((-1, 1))
+            loss_np, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"words": tensor_words,
+                      "label": label},
+                fetch_list=[loss, batch_acc, batch_size_tensor])
+            iters += 1
+            for x in data:
+                num_samples += len(x[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iters, loss_np, acc)
+            )  # The accuracy is the accumulation of batches, but not the current batch.
+
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        exit(0)


 def to_lodtensor(data, place):
@@ -205,5 +223,14 @@ def to_lodtensor(data, place):
    return res


+def print_arguments(args):
+    print('----------- lstm Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
 if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
    main()
--- a/benchmark/fluid/vgg.py
+++ b/benchmark/fluid/vgg.py
@@ -191,25 +191,29 @@ def main():
                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
            accuracy.add(value=acc, weight=weight)
            iters += 1
-            num_samples += len(data)
+            num_samples += len(y_data)
            print(
                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                (pass_id, iters, loss, acc)
            )  # The accuracy is the accumulation of batches, but not the current batch.

-        pass_train_acc = accuracy.eval()
+        # pass_train_acc = accuracy.eval()
        train_losses.append(loss)
        train_accs.append(acc)
+        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            pass_test_acc = test(exe)
-        train_elapsed = time.time() - start_time
-        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        exit(0)


 def print_arguments():
-    print('-----------  Configuration Arguments -----------')
+    print('----------- vgg Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')

--- a/benchmark/tensorflow/machine_translation.py
+++ b/benchmark/tensorflow/machine_translation.py
--- a/benchmark/tensorflow/mnist.py
+++ b/benchmark/tensorflow/mnist.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import time
+import numpy as np
+
+import tensorflow as tf
+import paddle.v2 as paddle
+
+DTYPE = tf.float32
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("mnist model benchmark.")
+    parser.add_argument(
+        '--batch_size', type=int, default=128, help='The minibatch size.')
+    parser.add_argument(
+        '--iterations', type=int, default=35, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=5, help='The number of passes.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    args = parser.parse_args()
+    return args
+
+
+def run_benchmark(args):
+    def weight_variable(dtype, shape):
+        initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
+        return tf.Variable(initial)
+
+    def bias_variable(dtype, shape):
+        initial = tf.constant(0.1, shape=shape, dtype=dtype)
+        return tf.Variable(initial)
+
+    device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
+    with tf.device(device):
+        images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
+        labels = tf.placeholder(tf.int64, shape=(None, ))
+
+        # conv1, relu, pool1
+        conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
+        conv1_bias = bias_variable(DTYPE, [20])
+        conv1 = tf.nn.conv2d(
+            images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
+        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
+        pool1 = tf.nn.max_pool(
+            relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
+
+        # conv2, relu, pool2
+        conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
+        conv2_bias = bias_variable(DTYPE, [50])
+        conv2 = tf.nn.conv2d(
+            pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
+        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
+        pool2 = tf.nn.max_pool(
+            relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
+
+        # FC 
+        pool_shape = pool2.get_shape().as_list()
+        hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
+        reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
+        fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
+        fc_bias = bias_variable(DTYPE, [10])
+        logits = tf.matmul(reshape, fc_weights) + fc_bias
+
+        # Get prediction
+        prediction = tf.nn.softmax(logits)
+
+        # Loss 
+        one_hot_labels = tf.one_hot(labels, depth=10)
+        cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
+        avg_cost = tf.reduce_mean(cost)
+
+        # Get accuracy
+        correct = tf.equal(tf.argmax(prediction, 1), labels)
+        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+        # metrics, g_accuracy
+        with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
+            g_accuracy = tf.metrics.accuracy(
+                labels, tf.argmax(
+                    prediction, axis=1))
+            vars = tf.contrib.framework.get_variables(
+                scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
+            g_accuracy_reset_op = tf.variables_initializer(vars)
+
+        # Optimizer 
+        opt = tf.train.AdamOptimizer(
+            learning_rate=0.001, beta1=0.9, beta2=0.999)
+        train_op = opt.minimize(avg_cost)
+        # train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)
+
+    def eval_test():
+        sess.run(g_accuracy_reset_op)
+        for batch_id, data in enumerate(test_reader()):
+            images_data = np.array(
+                map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
+            labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
+
+            loss, acc, g_acc = sess.run(
+                [avg_cost, accuracy, g_accuracy],
+                feed_dict={images: images_data,
+                           labels: labels_data})
+        return g_acc[1]
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_g)
+        sess.run(init_l)
+        for pass_id in range(args.pass_num):
+            sess.run(g_accuracy_reset_op)
+
+            pass_start = time.time()
+            for batch_id, data in enumerate(train_reader()):
+                images_data = np.array(
+                    map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
+                labels_data = np.array(map(lambda x: x[1], data)).astype(
+                    "int64")
+
+                start = time.time()
+                _, loss, acc, g_acc = sess.run(
+                    [train_op, avg_cost, accuracy, g_accuracy],
+                    feed_dict={images: images_data,
+                               labels: labels_data})
+                end = time.time()
+
+                print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
+                      (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
+
+            pass_end = time.time()
+            test_avg_acc = eval_test()
+
+            print(
+                "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
+                % (pass_id, g_acc[1], test_avg_acc,
+                   (pass_end - pass_start) / 1000))
+
+
+def print_arguments(args):
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    run_benchmark(args)
--- a/benchmark/tensorflow/resnet.py
+++ b/benchmark/tensorflow/resnet.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py
+
+Get help: python resnet.py --help
+See performance on flowers: python resnet.py
+Train on cifar10: python resnet.py --data=cifar10 --with_test
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import time
+import numpy as np
+
+import paddle.v2 as paddle
+import tensorflow as tf
+
+DTYPE = tf.float32
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('Convolution model benchmark.')
+    parser.add_argument(
+        '--model',
+        type=str,
+        choices=['resnet'],
+        default='resnet',
+        help='The model architecture.')
+    parser.add_argument(
+        '--batch_size', type=int, default=32, help='The minibatch size.')
+    parser.add_argument(
+        '--use_fake_data',
+        action='store_true',
+        help='use real data or fake data')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations',
+        type=int,
+        default=105,
+        help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=300, help='The number of passes.')
+    parser.add_argument(
+        '--order',
+        type=str,
+        default='NHWC',
+        choices=['NCHW', 'NHWC'],
+        help='The data order, now only support NCHW.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--data',
+        type=str,
+        default='flowers102',
+        choices=['flowers102', 'cifar10'],
+        help='The kinds of data.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    args = parser.parse_args()
+    return args
+
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    vars(args)['iterations'] = vars(args)['pass_num'] * 1000 if vars(args)[
+        'with_test'] else vars(args)['iterations']
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def fixed_padding(inputs, kernel_size, data_format):
+    """Pads the input along the spatial dimensions independently of input size.
+  Args:
+    inputs: A tensor of size [batch, channels, height_in, width_in] or
+      [batch, height_in, width_in, channels] depending on data_format.
+    kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
+                 Should be a positive integer.
+    data_format: The input format ('channels_last' or 'channels_first').
+  Returns:
+    A tensor with the same format as the input with the data either intact
+    (if kernel_size == 1) or padded (if kernel_size > 1).
+  """
+    pad_total = kernel_size - 1
+    pad_beg = pad_total // 2
+    pad_end = pad_total - pad_beg
+
+    if data_format == 'channels_first':
+        padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end],
+                                        [pad_beg, pad_end]])
+    else:
+        padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
+                                        [pad_beg, pad_end], [0, 0]])
+    return padded_inputs
+
+
+def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
+    """Strided 2-D convolution with explicit padding."""
+    # The padding is consistent and is based only on `kernel_size`, not on the
+    # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
+    # This is consistent with PaddlePaddle.
+    # In addition, the calculation for output size in TensorFlow can refer: 
+    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc
+    if strides > 1:
+        inputs = fixed_padding(inputs, kernel_size, data_format)
+
+    return tf.layers.conv2d(
+        inputs=inputs,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=('SAME' if strides == 1 else 'VALID'),
+        use_bias=False,
+        kernel_initializer=tf.variance_scaling_initializer(),
+        data_format=data_format)
+
+
+def conv_bn(inputs,
+            filters,
+            kernel_size,
+            strides,
+            is_training,
+            data_format,
+            act=True):
+    # def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
+    # set fused=True for a significant performance boost. See
+    # https://www.tensorflow.org/performance/performance_guide#common_fused_ops
+    inputs = conv2d_fixed_padding(
+        inputs=inputs,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        data_format=data_format)
+    inputs = tf.layers.batch_normalization(
+        inputs=inputs,
+        axis=1 if data_format == 'channels_first' else 3,
+        momentum=0.9,
+        epsilon=1e-05,
+        center=True,
+        scale=True,
+        training=is_training,
+        fused=True)
+    if act:
+        inputs = tf.nn.relu(inputs)
+    return inputs
+
+
+def basicblock(inputs, filters, is_training, projection_shortcut, strides,
+               data_format):
+    shortcut = inputs
+    if projection_shortcut is not None:
+        shortcut = projection_shortcut(inputs)
+    inputs = conv_bn(inputs, filters, 3, strides, is_training, data_format)
+    inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
+    inputs = inputs + shortcut
+    inputs = tf.nn.relu(inputs)
+    return inputs
+
+
+def bottleneck(inputs, filters, is_training, projection_shortcut, strides,
+               data_format):
+    shortcut = inputs
+    if projection_shortcut is not None:
+        shortcut = projection_shortcut(inputs)
+    inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format)
+    inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
+    inputs = conv_bn(
+        inputs, filters * 4, 1, 1, is_training, data_format, act=False)
+    inputs = inputs + shortcut
+    inputs = tf.nn.relu(inputs)
+    return inputs
+
+
+def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name,
+                data_format):
+    # Bottleneck blocks end with 4x the number of filters as they start with
+    filters_out = 4 * filters if block_fn is bottleneck else filters
+
+    def projection_shortcut(inputs):
+        return conv2d_fixed_padding(
+            inputs=inputs,
+            filters=filters_out,
+            kernel_size=1,
+            strides=strides,
+            data_format=data_format)
+
+    # Only the first block per block_layer uses projection_shortcut and strides
+    inputs = block_fn(inputs, filters, is_training, projection_shortcut,
+                      strides, data_format)
+
+    for _ in range(1, blocks):
+        inputs = block_fn(inputs, filters, is_training, None, 1, data_format)
+
+    return tf.identity(inputs, name)
+
+
+def resnet_imagenet(depth, class_dim, data_format):
+    """Returns the ResNet model for a given size and number of output classes."""
+
+    def resnet_generator(block_fn,
+                         layers,
+                         num_classes,
+                         data_format='channels_last'):
+        if data_format is None:
+            data_format = ('channels_first'
+                           if tf.test.is_built_with_cuda() else 'channels_last')
+
+        def model(inputs, is_training):
+            """Constructs the ResNet model given the inputs."""
+            if data_format == 'channels_first':
+                # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
+                # This provides a large performance boost on GPU. See
+                # https://www.tensorflow.org/performance/performance_guide#data_formats
+                inputs = tf.transpose(inputs, [0, 3, 1, 2])
+
+            inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format)
+            inputs = tf.identity(inputs, 'initial_conv')
+            inputs = tf.layers.max_pooling2d(
+                inputs=inputs,
+                pool_size=3,
+                strides=2,
+                padding='SAME',
+                data_format=data_format)
+            inputs = tf.identity(inputs, 'initial_max_pool')
+            inputs = block_layer(inputs, 64, block_fn, layers[0], 1,
+                                 is_training, 'block_layer1', data_format)
+            inputs = block_layer(inputs, 128, block_fn, layers[1], 2,
+                                 is_training, 'block_layer2', data_format)
+            inputs = block_layer(inputs, 256, block_fn, layers[2], 2,
+                                 is_training, 'block_layer3', data_format)
+            inputs = block_layer(inputs, 512, block_fn, layers[3], 2,
+                                 is_training, 'block_layer4', data_format)
+            inputs = tf.layers.average_pooling2d(
+                inputs=inputs,
+                pool_size=7,
+                strides=1,
+                padding='VALID',
+                data_format=data_format)
+            inputs = tf.identity(inputs, 'final_avg_pool')
+            inputs = tf.reshape(inputs,
+                                [-1, 512 if block_fn is basicblock else 2048])
+            inputs = tf.layers.dense(inputs=inputs, units=num_classes)
+            inputs = tf.identity(inputs, 'final_dense')
+            return inputs
+
+        return model
+
+    model_params = {
+        18: {
+            'block': basicblock,
+            'layers': [2, 2, 2, 2]
+        },
+        34: {
+            'block': basicblock,
+            'layers': [3, 4, 6, 3]
+        },
+        50: {
+            'block': bottleneck,
+            'layers': [3, 4, 6, 3]
+        },
+        101: {
+            'block': bottleneck,
+            'layers': [3, 4, 23, 3]
+        },
+        152: {
+            'block': bottleneck,
+            'layers': [3, 8, 36, 3]
+        },
+        200: {
+            'block': bottleneck,
+            'layers': [3, 24, 36, 3]
+        }
+    }
+    if depth not in model_params:
+        raise ValueError('Not a valid depth:', depth)
+    params = model_params[depth]
+    return resnet_generator(params['block'], params['layers'], class_dim,
+                            data_format)
+
+
+def resnet_cifar10(depth, num_classes, data_format):
+    if depth % 6 != 2:
+        raise ValueError('depth must be 6n + 2:', depth)
+
+    num_blocks = (depth - 2) // 6
+
+    if data_format is None:
+        data_format = ('channels_first'
+                       if tf.test.is_built_with_cuda() else 'channels_last')
+
+    def model(inputs, is_training):
+        inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format)
+        inputs = tf.identity(inputs, 'initial_conv')
+        inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training,
+                             'block_layer1', data_format)
+        inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training,
+                             'block_layer2', data_format)
+        inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training,
+                             'block_layer3', data_format)
+        inputs = tf.layers.average_pooling2d(
+            inputs=inputs,
+            pool_size=8,
+            strides=1,
+            padding='VALID',
+            data_format=data_format)
+        inputs = tf.identity(inputs, 'final_avg_pool')
+        inputs = tf.reshape(inputs, [-1, 64])
+        inputs = tf.layers.dense(inputs=inputs, units=num_classes)
+        inputs = tf.identity(inputs, 'final_dense')
+        return inputs
+
+    return model
+
+
+def run_benchmark(args, data_format='channels_last', device='/cpu:0'):
+    """Our model_fn for ResNet to be used with our Estimator."""
+
+    class_dim = 1000
+    dshape = (None, 224, 224, 3)
+
+    pdshape = (3, 224, 224)
+    if args.data == 'flowers102':
+        class_dim = 102
+        dshape = (None, 224, 224, 3)
+        pdshape = (3, 224, 224)
+    elif args.data == 'cifar10':
+        class_dim = 10
+        dshape = (None, 32, 32, 3)
+        pdshape = (3, 32, 32)
+
+    with tf.device(device):
+        images = tf.placeholder(DTYPE, shape=dshape)
+        labels = tf.placeholder(tf.int64, shape=(None, ))
+        is_training = tf.placeholder('bool')
+        onehot_labels = tf.one_hot(labels, depth=class_dim)
+
+        network = resnet_cifar10(
+            32, class_dim,
+            data_format) if args.data == 'cifar10' else resnet_imagenet(
+                50, class_dim, data_format)
+
+        logits = network(inputs=images, is_training=is_training)
+
+        cross_entropy = tf.losses.softmax_cross_entropy(
+            logits=logits, onehot_labels=onehot_labels)
+        avg_cost = tf.reduce_mean(cross_entropy)
+
+        correct = tf.equal(tf.argmax(logits, 1), labels)
+        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+        lr = 0.1 if args.data == 'cifar10' else 0.01
+        optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
+
+        # Batch norm requires update_ops to be added as a train_op dependency.
+        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+        with tf.control_dependencies(update_ops):
+            train_op = optimizer.minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data == 'cifar10' else paddle.dataset.flowers.test(),
+        batch_size=100)
+
+    def test():
+        test_accs = []
+        for batch_id, data in enumerate(test_reader()):
+            test_images = np.array(
+                map(lambda x: np.transpose(x[0].reshape(pdshape),
+                axes=[1, 2, 0]), data)).astype("float32")
+            test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
+            test_accs.append(
+                accuracy.eval(feed_dict={
+                    images: test_images,
+                    labels: test_labels,
+                    is_training: False
+                }))
+        print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" %
+              (pass_id, num_samples / train_elapsed, np.mean(test_accs)))
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_g)
+        sess.run(init_l)
+
+        if args.use_fake_data:
+            data = train_reader().next()
+            images_data = np.array(
+                    map(lambda x: np.transpose(x[0].reshape(pdshape),
+                    axes=[1, 2, 0]), data)).astype("float32")
+            labels_data = np.array(map(lambda x: x[1], data)).astype('int64')
+        iters, num_samples, start_time = 0, 0, 0.0
+        for pass_id in range(args.pass_num):
+            if iters == args.iterations:
+                break
+            train_accs = []
+            train_losses = []
+            for batch_id, data in enumerate(train_reader()):
+                if iters == args.skip_batch_num:
+                    start_time = time.time()
+                    num_samples = 0
+                if iters == args.iterations:
+                    break
+                if not args.use_fake_data:
+                    images_data = np.array(
+                        map(lambda x: np.transpose(x[0].reshape(pdshape),
+                        axes=[1, 2, 0]), data)).astype("float32")
+                    labels_data = np.array(map(lambda x: x[1], data)).astype(
+                        'int64')
+                _, loss, acc = sess.run([train_op, avg_cost, accuracy],
+                                        feed_dict={
+                                            images: images_data,
+                                            labels: labels_data,
+                                            is_training: True
+                                        })
+                iters += 1
+                train_accs.append(acc)
+                train_losses.append(loss)
+                num_samples += len(data)
+                print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" %
+                      (pass_id, iters, loss, acc))
+
+            train_elapsed = time.time() - start_time
+            print("Pass=%d, Loss=%f, Accuray=%f\n" %
+                  (pass_id, np.mean(train_losses), np.mean(train_accs)))
+
+            # evaluation
+            if args.with_test:
+                test()
+
+        if not args.with_test:
+            duration = time.time() - start_time
+            examples_per_sec = num_samples / duration
+            sec_per_batch = duration / (iters - args.skip_batch_num)
+
+            print('Total examples: %d, total time: %.5f' %
+                  (num_samples, duration))
+            print('%.5f examples/sec, %.5f sec/batch' %
+                  (examples_per_sec, sec_per_batch))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    if tf.test.is_built_with_cuda():
+        device = '/device:GPU:0'
+        if args.order == 'NHWC':
+            data_format = 'channels_last'
+        else:
+            data_format = 'channels_first'
+    else:
+        device = '/cpu:0'
+        if args.order == 'NHWC':
+            data_format = 'channels_last'
+        else:
+            raise ValueError('Only support NHWC order in CPU mode')
+
+    run_benchmark(args, data_format, device)
--- a/benchmark/tensorflow/stacked_dynamic_lstm.py
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import tensorflow as tf
+
+import paddle.v2 as paddle
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("LSTM model benchmark.")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--stacked_num',
+        type=int,
+        default=5,
+        help='Number of lstm layers to stack. (default: %(default)d)')
+    parser.add_argument(
+        '--embedding_dim',
+        type=int,
+        default=512,
+        help='Dimension of embedding table. (default: %(default)d)')
+    parser.add_argument(
+        '--hidden_dim',
+        type=int,
+        default=512,
+        help='Hidden size of lstm unit. (default: %(default)d)')
+    parser.add_argument(
+        '--pass_num',
+        type=int,
+        default=10,
+        help='Epoch number to train. (default: %(default)d)')
+    parser.add_argument(
+        '--learning_rate',
+        type=float,
+        default=0.0002,
+        help='Learning rate used to train. (default: %(default)f)')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    args = parser.parse_args()
+    return args
+
+
+def print_arguments(args):
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def dynamic_lstm_model(dict_size,
+                       embedding_dim,
+                       hidden_dim,
+                       stacked_num,
+                       class_num=2,
+                       is_train=True):
+    word_idx = tf.placeholder(tf.int64, shape=[None, None])
+    sequence_length = tf.placeholder(tf.int64, shape=[None, ])
+
+    embedding_weights = tf.get_variable('word_embeddings',
+                                        [dict_size, embedding_dim])
+    embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
+
+    lstm_cell = tf.nn.rnn_cell.LSTMCell(
+        num_units=hidden_dim, use_peepholes=False)
+    stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num)
+
+    # final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
+    _, final_state = tf.nn.dynamic_rnn(
+        cell=stacked_cell,
+        inputs=embedding,
+        dtype=tf.float32,
+        sequence_length=sequence_length)
+
+    w = tf.Variable(
+        tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
+    bias = tf.Variable(
+        tf.constant(
+            value=0.0, shape=[class_num], dtype=tf.float32))
+    prediction = tf.matmul(final_state[-1][1], w) + bias
+
+    if not is_train:
+        return (word_idx, sequence_length), tf.nn.softmax(prediction)
+
+    label = tf.placeholder(tf.int64, shape=[None, ])
+    loss = tf.nn.softmax_cross_entropy_with_logits(
+        labels=tf.one_hot(label, 2), logits=prediction)
+    avg_loss = tf.reduce_mean(loss)
+
+    correct_count = tf.equal(tf.argmax(prediction, 1), label)
+    acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
+
+    with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
+        g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
+        vars = tf.contrib.framework.get_variables(
+            scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
+        reset_op = tf.variables_initializer(vars)
+
+    return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
+
+
+def padding_data(data, padding_size, value):
+    data = data + [value] * padding_size
+    return data[:padding_size]
+
+
+def train(args):
+    word_dict = paddle.dataset.imdb.word_dict()
+    dict_size = len(word_dict)
+
+    feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
+        dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
+
+    adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+    train_op = adam_optimizer.minimize(avg_loss)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=args.batch_size)
+
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.test(word_dict), buf_size=25000),
+        batch_size=args.batch_size)
+
+    def do_validation(sess):
+        sess.run(reset_op)
+        for batch_id, data in enumerate(test_reader()):
+            word_idx = map(lambda x: x[0], data)
+            sequence_length = np.array(
+                [len(seq) for seq in word_idx]).astype('int64')
+            maxlen = np.max(sequence_length)
+            word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
+            word_idx = np.array(word_idx).astype('int64')
+            label = np.array(map(lambda x: x[1], data)).astype('int64')
+
+            _, loss, fetch_acc, fetch_g_acc = sess.run(
+                [train_op, avg_loss, acc, g_acc],
+                feed_dict={
+                    feeding_list[0]: word_idx,
+                    feeding_list[1]: sequence_length,
+                    feeding_list[2]: label
+                })
+
+        return fetch_g_acc[1]
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_l)
+        sess.run(init_g)
+
+        for pass_id in xrange(args.pass_num):
+            # clear accuracy local variable 
+            sess.run(reset_op)
+            pass_start_time = time.time()
+            words_seen = 0
+
+            for batch_id, data in enumerate(train_reader()):
+                word_idx = map(lambda x: x[0], data)
+                sequence_length = np.array(
+                    [len(seq) for seq in word_idx]).astype('int64')
+                words_seen += np.sum(sequence_length)
+                maxlen = np.max(sequence_length)
+                word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
+                word_idx = np.array(word_idx).astype('int64')
+                label = np.array(map(lambda x: x[1], data)).astype('int64')
+
+                _, loss, fetch_acc, fetch_g_acc = sess.run(
+                    [train_op, avg_loss, acc, g_acc],
+                    feed_dict={
+                        feeding_list[0]: word_idx,
+                        feeding_list[1]: sequence_length,
+                        feeding_list[2]: label
+                    })
+
+                print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
+                      % (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
+
+            pass_end_time = time.time()
+            time_consumed = pass_end_time - pass_start_time
+            words_per_sec = words_seen / time_consumed
+            test_acc = do_validation(sess)
+            print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
+                  (pass_id, test_acc, words_per_sec, time_consumed))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+
+    if args.infer_only:
+        pass
+    else:
+        train(args)
--- a/benchmark/tensorflow/vgg.py
+++ b/benchmark/tensorflow/vgg.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in TensorFlow"""
+import tensorflow as tf
+import paddle.v2 as paddle
+import numpy as np
+import argparse
+import time
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The first num of minibatch num to skip, for better performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument(
+    '--data_format',
+    type=str,
+    default='NHWC',
+    choices=['NCHW', 'NHWC'],
+    help='The data order, NCHW=[batch, channels, height, width].'
+    'Only support NHWC right now.')
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+args = parser.parse_args()
+
+
+class VGG16Model(object):
+    def __init__(self):
+        self.parameters = []
+
+    def batch_norm_relu(self, inputs, is_training):
+        """Performs a batch normalization followed by a ReLU."""
+        # We set fused=True for a significant speed boost. See
+        # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
+        inputs = tf.layers.batch_normalization(
+            inputs=inputs,
+            axis=1 if args.data_format == 'NCHW' else -1,
+            momentum=0.9,
+            epsilon=1e-05,
+            center=True,
+            scale=True,
+            training=is_training,
+            fused=True)
+        inputs = tf.nn.relu(inputs)
+        return inputs
+
+    def conv_bn_layer(self,
+                      name,
+                      images,
+                      kernel_shape,
+                      is_training,
+                      drop_rate=0.0):
+        with tf.name_scope(name) as scope:
+            kernel = tf.Variable(
+                tf.truncated_normal(
+                    kernel_shape, dtype=tf.float32, stddev=1e-1),
+                name='weights')
+            conv = tf.nn.conv2d(
+                images,
+                kernel, [1, 1, 1, 1],
+                data_format=args.data_format,
+                padding='SAME')
+            biases = tf.Variable(
+                tf.constant(
+                    0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
+                trainable=True,
+                name='biases')
+            out = tf.nn.bias_add(conv, biases)
+            out = self.batch_norm_relu(out, is_training)
+            out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
+            return out
+
+    def fc_layer(self, name, inputs, shape):
+        with tf.name_scope(name) as scope:
+            fc_w = tf.Variable(
+                tf.truncated_normal(
+                    shape, dtype=tf.float32, stddev=1e-1),
+                name='weights')
+            fc_b = tf.Variable(
+                tf.constant(
+                    0.0, shape=[shape[-1]], dtype=tf.float32),
+                trainable=True,
+                name='biases')
+            out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
+            return out
+
+    def network(self, images, class_dim, is_training):
+        """ VGG16 model structure.
+
+            TODO(kuke): enable this network to support the 'NCHW' data format
+        """
+
+        # conv1
+        conv1_1 = self.conv_bn_layer(
+            'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
+        conv1_2 = self.conv_bn_layer(
+            'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
+        # pool1
+        pool1 = tf.nn.max_pool(
+            conv1_2,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool1')
+        # conv2
+        conv2_1 = self.conv_bn_layer(
+            'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
+        conv2_2 = self.conv_bn_layer(
+            'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
+        # pool2
+        pool2 = tf.nn.max_pool(
+            conv2_2,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool2')
+        # conv3
+        conv3_1 = self.conv_bn_layer(
+            'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
+        conv3_2 = self.conv_bn_layer(
+            'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
+        conv3_3 = self.conv_bn_layer(
+            'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
+        # pool3
+        pool3 = tf.nn.max_pool(
+            conv3_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool3')
+        # conv4
+        conv4_1 = self.conv_bn_layer(
+            'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
+        conv4_2 = self.conv_bn_layer(
+            'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv4_3 = self.conv_bn_layer(
+            'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+        # pool4
+        pool4 = tf.nn.max_pool(
+            conv4_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool4')
+        # conv5
+        conv5_1 = self.conv_bn_layer(
+            'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv5_2 = self.conv_bn_layer(
+            'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv5_3 = self.conv_bn_layer(
+            'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+        # pool5
+        pool5 = tf.nn.max_pool(
+            conv5_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool4')
+        # flatten
+        shape = int(np.prod(pool5.get_shape()[1:]))
+        pool5_flat = tf.reshape(pool5, [-1, shape])
+        # fc1
+        drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
+        fc1 = self.fc_layer('fc1', drop, [shape, 512])
+        # fc2
+        bn = self.batch_norm_relu(fc1, is_training)
+        drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
+        fc2 = self.fc_layer('fc2', drop, [512, 512])
+
+        fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
+
+        return fc3
+
+
+def run_benchmark():
+    """Run benchmark on cifar10 or flowers."""
+
+    if args.data_set == "cifar10":
+        class_dim = 10
+        raw_shape = (3, 32, 32)
+        dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
+            None, 3, 32, 32)
+    else:
+        class_dim = 102
+        raw_shape = (3, 224, 224)
+        dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
+            None, 3, 224, 224)
+
+    device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
+
+    with tf.device(device):
+        images = tf.placeholder(tf.float32, shape=dat_shape)
+        labels = tf.placeholder(tf.int64, shape=(None, ))
+        is_training = tf.placeholder('bool')
+        onehot_labels = tf.one_hot(labels, depth=class_dim)
+
+        vgg16 = VGG16Model()
+        logits = vgg16.network(images, class_dim, is_training)
+        loss = tf.losses.softmax_cross_entropy(
+            onehot_labels=onehot_labels, logits=logits)
+        avg_loss = tf.reduce_mean(loss)
+
+        correct = tf.equal(tf.argmax(logits, 1), labels)
+        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+        with tf.control_dependencies(update_ops):
+            train_op = optimizer.minimize(avg_loss)
+
+    # data reader
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.test10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+
+    # test
+    def test():
+        test_accs = []
+        for batch_id, data in enumerate(test_reader()):
+            test_images = np.array(
+         map(lambda x: np.transpose(x[0].reshape(raw_shape),
+         axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
+            test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
+            test_accs.append(
+                accuracy.eval(feed_dict={
+                    images: test_images,
+                    labels: test_labels,
+                    is_training: False
+                }))
+        return np.mean(test_accs)
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_g)
+        sess.run(init_l)
+        iters, num_samples, start_time = 0, 0, time.time()
+        for pass_id in range(args.num_passes):
+            # train
+            num_samples = 0
+            start_time = time.time()
+            for batch_id, data in enumerate(train_reader()):
+                if iters == args.skip_batch_num:
+                    start_time = time.time()
+                    num_samples = 0
+                if iters == args.iterations:
+                    break
+                train_images = np.array(
+                    map(lambda x: np.transpose(x[0].reshape(raw_shape),
+                    axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
+                train_labels = np.array(map(lambda x: x[1], data)).astype(
+                    'int64')
+                _, loss, acc = sess.run([train_op, avg_loss, accuracy],
+                                        feed_dict={
+                                            images: train_images,
+                                            labels: train_labels,
+                                            is_training: True
+                                        })
+                iters += 1
+                num_samples += len(data)
+                print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
+                      (pass_id, iters, loss, acc))
+            train_elapsed = time.time() - start_time
+            # test
+            pass_test_acc = test()
+            print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
+                  (pass_id, num_samples / train_elapsed, pass_test_acc))
+
+
+def print_arguments():
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+    print_arguments()
+    run_benchmark()
--- a/paddle/.gitignore
+++ b/paddle/.gitignore
+.timestamp
 *.o
 *.a
 .svn

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -93,6 +93,43 @@ static void CheckTensorNANOrInf(const std::string& name,
                 "Tensor %s contains NAN", name);
 }

+void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
+                               int block_id) {
+  auto& global_block = pdesc.Block(block_id);
+
+  const Scope* ancestor_scope = scope;
+  while (ancestor_scope->parent()) {
+    ancestor_scope = ancestor_scope->parent();
+  }
+
+  if (ancestor_scope != scope) {
+    for (auto& var : global_block.AllVars()) {
+      if (var->Name() == framework::kEmptyVarName) {
+        continue;
+      }
+
+      if (var->Persistable()) {
+        auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+        VLOG(3) << "Create Variable " << var->Name()
+                << " global, which pointer is " << ptr;
+      } else {
+        auto* ptr = scope->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+        VLOG(3) << "Create Variable " << var->Name()
+                << " locally, which pointer is " << ptr;
+      }
+    }
+  } else {
+    for (auto& var : global_block.AllVars()) {
+      auto* ptr = scope->Var(var->Name());
+      InitializeVariable(ptr, var->GetType());
+      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
+              << ptr;
+    }
+  }
+}
+
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                   bool create_local_scope, bool create_vars) {
  platform::RecordBlock b(block_id);
@@ -184,8 +221,8 @@ static bool has_fetch_operators(
 void Executor::Run(const ProgramDesc& program, Scope* scope,
                   std::map<std::string, const LoDTensor*>& feed_targets,
                   std::map<std::string, LoDTensor*>& fetch_targets,
-                   const std::string& feed_holder_name,
-                   const std::string& fetch_holder_name, bool create_vars) {
+                   bool create_vars, const std::string& feed_holder_name,
+                   const std::string& fetch_holder_name) {
  platform::RecordBlock b(kProgramId);
  bool has_feed_ops =
      has_feed_operators(program.Block(0), feed_targets, feed_holder_name);
@@ -296,38 +333,13 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(

 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                                  bool create_local_scope, bool create_vars) {
-  auto& block = ctx->prog_.Block(ctx->block_id_);
-
  Scope* local_scope = scope;
  if (create_vars) {
    if (create_local_scope) {
      local_scope = &scope->NewScope();
-      for (auto& var : block.AllVars()) {
-        if (var->Name() == framework::kEmptyVarName) {
-          continue;
-        }
-
-        if (var->Persistable()) {
-          auto* ptr = scope->Var(var->Name());
-          InitializeVariable(ptr, var->GetType());
-          VLOG(3) << "Create Variable " << var->Name()
-                  << " global, which pointer is " << ptr;
-        } else {
-          auto* ptr = local_scope->Var(var->Name());
-          InitializeVariable(ptr, var->GetType());
-          VLOG(3) << "Create Variable " << var->Name()
-                  << " locally, which pointer is " << ptr;
-        }
-      }
-    } else {
-      for (auto& var : block.AllVars()) {
-        auto* ptr = local_scope->Var(var->Name());
-        InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-                << ptr;
-      }
-    }  // if (create_local_scope)
-  }    // if (create_vars)
+    }
+    CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
+  }

  for (auto& op : ctx->ops_) {
    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -54,9 +54,9 @@ class Executor {
  void Run(const ProgramDesc& program, Scope* scope,
           std::map<std::string, const LoDTensor*>& feed_targets,
           std::map<std::string, LoDTensor*>& fetch_targets,
+           bool create_vars = true,
           const std::string& feed_holder_name = "feed",
-           const std::string& fetch_holder_name = "fetch",
-           bool create_vars = true);
+           const std::string& fetch_holder_name = "fetch");

  static std::unique_ptr<ExecutorPrepareContext> Prepare(
      const ProgramDesc& program, int block_id);
@@ -64,6 +64,8 @@ class Executor {
  static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
      const ProgramDesc& program, const std::vector<int>& block_ids);

+  void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
+
  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                          bool create_local_scope = true,
                          bool create_vars = true);

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/parallel_executor.h"
-#include "paddle/fluid/platform/profiler.h"

 #include <string>
 #include <vector>
@@ -24,6 +23,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace framework {
@@ -43,30 +43,40 @@ class ParallelExecutorPrivate {
 #endif
 };

+std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
+  return member_->local_scopes_;
+}
+
 ParallelExecutor::ParallelExecutor(
    size_t num_threads, bool use_event,
    const std::vector<platform::Place> &places,
    const std::unordered_set<std::string> &params,
-    const ProgramDesc &startup_program, const ProgramDesc &main_program,
-    const std::string &loss_var_name, Scope *scope, bool allow_op_delay)
+    const std::unordered_set<std::string> &bcast_vars,
+    const ProgramDesc &main_program, const std::string &loss_var_name,
+    Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay)
    : member_(new ParallelExecutorPrivate(places)) {
  member_->global_scope_ = scope;

-  // Step 1. RunStartupProgram and Bcast the params to devs.
-  Executor exe(places[0]);
-  exe.Run(startup_program, scope, 0);
+  // Step 1. Bcast the params to devs.
  // Create local scopes
-  for (size_t i = 0; i < member_->places_.size(); ++i) {
-    member_->local_scopes_.push_back(&scope->NewScope());
+  if (local_scopes.empty()) {
+    for (size_t i = 0; i < member_->places_.size(); ++i) {
+      member_->local_scopes_.push_back(&scope->NewScope());
+    }
+  } else {
+    PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
+    for (size_t i = 0; i < member_->places_.size(); ++i) {
+      member_->local_scopes_.push_back(local_scopes[i]);
+    }
  }

 // Bcast Parameters to all GPUs
 #ifdef PADDLE_WITH_CUDA
  member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
 #endif
-  if (platform::is_gpu_place(places[0]) &&
-      member_->local_scopes_.size() != 1) {  // Is CUDA
-    BCastParamsToGPUs(startup_program);
+  if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
+      local_scopes.empty()) {  // Is CUDA
+    BCastParamsToGPUs(bcast_vars);
  }
 // Startup Program has been run. All local scopes has correct parameters.

@@ -99,48 +109,47 @@ ParallelExecutor::ParallelExecutor(
 }

 void ParallelExecutor::BCastParamsToGPUs(
-    const ProgramDesc &startup_program) const {
+    const std::unordered_set<std::string> &vars) const {
 #ifdef PADDLE_WITH_CUDA
  auto *main_scope = member_->local_scopes_[0];

-  for (auto *var_desc : startup_program.Block(0).AllVars()) {
-    size_t idx = var_desc->Name().find("@GRAD");
-    if (idx != std::string::npos) continue;
-    if (var_desc->GetType() == proto::VarType::LOD_TENSOR) {
-      auto &main_tensor =
-          main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
-
-      auto &dims = main_tensor.dims();
-
-      if (paddle::platform::is_gpu_place(main_tensor.place())) {
-        size_t numel = main_tensor.numel();
-        ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-        platform::NCCLGroupGuard guard;
-        for (size_t i = 0; i < member_->places_.size(); ++i) {
-          auto place = member_->places_[i];
-          void *buffer;
-          if (i == 0) {
-            buffer = const_cast<void *>(main_tensor.data<void>());
-          } else {
-            auto local_scope = member_->local_scopes_[i];
-            auto *t =
-                local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
-            t->Resize(dims);
-            buffer = t->mutable_data(place, main_tensor.type());
-          }
-          auto &nccl_ctx = member_->nccl_ctxs_->at(place);
-          platform::dynload::ncclBcast(buffer, numel, data_type, 0,
-                                       nccl_ctx.comm_, nccl_ctx.stream());
-        }
-      } else {
-        platform::CPUPlace cpu;
-        for (size_t i = 1; i < member_->places_.size(); ++i) {
+  for (auto &var : vars) {
+    auto *main_var = main_scope->FindVar(var);
+    if (!main_var->IsType<LoDTensor>()) {
+      continue;
+    }
+
+    auto &main_tensor = main_var->Get<LoDTensor>();
+
+    auto &dims = main_tensor.dims();
+
+    if (paddle::platform::is_gpu_place(main_tensor.place())) {
+      size_t numel = main_tensor.numel();
+      ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
+      platform::NCCLGroupGuard guard;
+      for (size_t i = 0; i < member_->places_.size(); ++i) {
+        auto place = member_->places_[i];
+        void *buffer;
+        if (i == 0) {
+          buffer = const_cast<void *>(main_tensor.data<void>());
+        } else {
          auto local_scope = member_->local_scopes_[i];
-          auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
+          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
          t->Resize(dims);
-          t->mutable_data(cpu, main_tensor.type());
-          paddle::framework::TensorCopy(main_tensor, cpu, t);
+          buffer = t->mutable_data(place, main_tensor.type());
        }
+        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
+        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
+                                     nccl_ctx.comm_, nccl_ctx.stream());
+      }
+    } else {
+      platform::CPUPlace cpu;
+      for (size_t i = 1; i < member_->places_.size(); ++i) {
+        auto local_scope = member_->local_scopes_[i];
+        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
+        t->Resize(dims);
+        t->mutable_data(cpu, main_tensor.type());
+        paddle::framework::TensorCopy(main_tensor, cpu, t);
      }
    }
    member_->nccl_ctxs_->WaitAll();

--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -36,11 +36,14 @@ class ParallelExecutor {
  explicit ParallelExecutor(size_t num_threads, bool use_event,
                            const std::vector<platform::Place>& places,
                            const std::unordered_set<std::string>& params,
-                            const ProgramDesc& startup_program,
+                            const std::unordered_set<std::string>& bcast_vars,
                            const ProgramDesc& main_program,
                            const std::string& loss_var_name, Scope* scope,
+                            const std::vector<Scope*>& local_scopes,
                            bool allow_op_delay);

+  std::vector<Scope*>& GetLocalScopes();
+
  void Run(const std::vector<std::string>& fetch_tensors,
           const std::string& fetched_var_name,
           const std::unordered_map<std::string, LoDTensor>& feed_tensors);
@@ -51,7 +54,7 @@ class ParallelExecutor {

  ParallelExecutorPrivate* member_;

-  void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
+  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
 };

 }  // namespace framework

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -58,7 +58,7 @@ class Scope {
  /// nullptr if cannot find.
  Variable* FindVar(const std::string& name) const;

-  const Scope& parent() const { return *parent_; }
+  const Scope* parent() const { return parent_; }

  /// Find the scope or an ancestor scope that contains the given variable.
  const Scope* FindScope(const Variable* var) const;

--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -46,8 +46,8 @@ TEST(inference, image_classification) {

  // Run inference on CPU
  LOG(INFO) << "--- CPU Runs: ---";
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
-                                            FLAGS_repeat);
+  TestInference<paddle::platform::CPUPlace, false>(dirname, cpu_feeds,
+                                                   cpu_fetchs1, FLAGS_repeat);
  LOG(INFO) << output1.dims();

 #ifdef PADDLE_WITH_CUDA
@@ -57,8 +57,8 @@ TEST(inference, image_classification) {

  // Run inference on CUDA GPU
  LOG(INFO) << "--- GPU Runs: ---";
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
-                                             FLAGS_repeat);
+  TestInference<paddle::platform::CUDAPlace, false>(dirname, cpu_feeds,
+                                                    cpu_fetchs2, FLAGS_repeat);
  LOG(INFO) << output2.dims();

  CheckError<float>(output1, output2);

--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -88,7 +88,7 @@ void CheckError(const paddle::framework::LoDTensor& output1,
  EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
 }

-template <typename Place>
+template <typename Place, bool CreateVars = true>
 void TestInference(const std::string& dirname,
                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
                   const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
@@ -166,8 +166,16 @@ void TestInference(const std::string& dirname,

  // 6. Run the inference program
  {
+    if (!CreateVars) {
+      // If users don't want to create and destroy variables every time they
+      // run, they need to set `create_vars` to false and manually call
+      // `CreateVariables` before running.
+      executor.CreateVariables(*inference_program, scope, 0);
+    }
+
    // Ignore the profiling results of the first run
-    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    executor.Run(*inference_program, scope, feed_targets, fetch_targets,
+                 CreateVars);

    // Enable the profiler
    paddle::platform::EnableProfiler(state);
@@ -178,7 +186,8 @@ void TestInference(const std::string& dirname,
          "run_inference",
          paddle::platform::DeviceContextPool::Instance().Get(place));

-      executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+      executor.Run(*inference_program, scope, feed_targets, fetch_targets,
+                   CreateVars);
    }

    // Disable the profiler and print the timing information

--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"

--- a/paddle/fluid/operators/adagrad_op.cc
+++ b/paddle/fluid/operators/adagrad_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/adagrad_op.h"
+#include <vector>

 #include <cmath>


--- a/paddle/fluid/operators/array_operator.h
+++ b/paddle/fluid/operators/array_operator.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <string>
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"

--- a/paddle/fluid/operators/assign_value_op.cc
+++ b/paddle/fluid/operators/assign_value_op.cc
@@ -13,6 +13,8 @@
 // limitations under the License.

 #include "paddle/fluid/operators/assign_value_op.h"
+#include <string>
+#include <vector>

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/assign_value_op.h
+++ b/paddle/fluid/operators/assign_value_op.h
@@ -14,6 +14,7 @@

 #pragma once

+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/enforce.h"

--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/auc_op.h"
+#include <string>

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"

@@ -40,7 +42,7 @@ class AucKernel : public framework::OpKernel<T> {
    std::vector<float> thresholds_list;
    thresholds_list.reserve(num_thresholds);
    for (int i = 1; i < num_thresholds - 1; i++) {
-      thresholds_list[i] = (float)i / (num_thresholds - 1);
+      thresholds_list[i] = static_cast<float>(i) / (num_thresholds - 1);
    }
    const float kEpsilon = 1e-7;
    thresholds_list[0] = 0.0f - kEpsilon;
@@ -105,11 +107,12 @@ class AucKernel : public framework::OpKernel<T> {
    float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace());
    float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace());
    for (int i = 0; i < num_thresholds; i++) {
-      tp_rate_data[i] =
-          ((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon);
-      fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon);
-      rec_rate_data[i] =
-          ((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon);
+      tp_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+                        (tp_data[i] + fn_data[i] + epsilon);
+      fp_rate_data[i] =
+          static_cast<float>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
+      rec_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+                         (tp_data[i] + fp_data[i] + epsilon);
    }
    *auc_data = 0.0f;
    if (curve == "ROC") {

--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -19,15 +19,15 @@ namespace operators {

 template <>
 void GetAccumulators<paddle::platform::CPUDeviceContext>(
-    const framework::ExecutionContext& ctx, int64_t& num_updates_,
-    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+    const framework::ExecutionContext& ctx, int64_t* num_updates_,
+    int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");

-  old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
-  num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
-  num_updates_ = in_num_updates->data<int64_t>()[0];
+  *old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
+  *num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
+  *num_updates_ = in_num_updates->data<int64_t>()[0];
 }

 template <>

--- a/paddle/fluid/operators/average_accumulates_op.cu
+++ b/paddle/fluid/operators/average_accumulates_op.cu
@@ -19,18 +19,18 @@ namespace paddle {
 namespace operators {
 template <>
 void GetAccumulators<paddle::platform::CUDADeviceContext>(
-    const framework::ExecutionContext& ctx, int64_t& num_updates_,
-    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+    const framework::ExecutionContext& ctx, int64_t* num_updates_,
+    int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
  auto stream = ctx.cuda_device_context().stream();
-  memory::Copy(platform::CPUPlace(), &old_num_accumulates_,
+  memory::Copy(platform::CPUPlace(), old_num_accumulates_,
               platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
               sizeof(int64_t), stream);
-  memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(),
+  memory::Copy(platform::CPUPlace(), num_accumulates_, platform::CUDAPlace(),
               in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
-  memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(),
+  memory::Copy(platform::CPUPlace(), num_updates_, platform::CUDAPlace(),
               in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
 }


--- a/paddle/fluid/operators/average_accumulates_op.h
+++ b/paddle/fluid/operators/average_accumulates_op.h
@@ -29,8 +29,8 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

 template <typename DeviceContext>
 void GetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t& num_updates, int64_t& num_accumulates,
-                     int64_t& old_num_accumulates);
+                     int64_t* num_updates, int64_t* num_accumulates,
+                     int64_t* old_num_accumulates);

 template <typename DeviceContext>
 void SetAccumulators(const framework::ExecutionContext& ctx,
@@ -47,8 +47,8 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
    int64_t num_updates = 0;
    int64_t num_accumulates = 0;
    int64_t old_num_accumulates = 0;
-    GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
-                                   old_num_accumulates);
+    GetAccumulators<DeviceContext>(ctx, &num_updates, &num_accumulates,
+                                   &old_num_accumulates);

    // Get attrs
    float average_window = ctx.Attr<float>("average_window");

--- a/paddle/fluid/operators/go_op.cc
+++ b/paddle/fluid/operators/go_op.cc
@@ -56,11 +56,11 @@ class GoOp : public framework::OperatorBase {

    // TODO(varunarora): Consider moving this root scope lookup to scope.h.
    const framework::Scope *root_scope = &scope;
-    const framework::Scope *parent_scope = &(root_scope->parent());
+    const framework::Scope *parent_scope = root_scope->parent();

    while (parent_scope != nullptr) {
      root_scope = parent_scope;
-      parent_scope = &(parent_scope->parent());
+      parent_scope = parent_scope->parent();
    }

    framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock);

--- a/paddle/fluid/operators/spp_op.cc
+++ b/paddle/fluid/operators/spp_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/spp_op.h"
+#include <string>
+#include <vector>
 namespace paddle {
 namespace operators {


--- a/paddle/fluid/operators/spp_op.h
+++ b/paddle/fluid/operators/spp_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/pooling.h"

--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/sum_op.h"
+#include <algorithm>
+#include <string>
 #include <vector>
 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"

--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"

--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
 #include <iostream>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"


--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/transpose_op.h"
+#include <vector>

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"


--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/unpool_op.h"
+#include <string>
+#include <vector>
 namespace paddle {
 namespace operators {


--- a/paddle/fluid/operators/unpool_op.h
+++ b/paddle/fluid/operators/unpool_op.h
@@ -14,6 +14,8 @@ limitations under the License. */

 #pragma once

+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/unpooling.h"

--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence_padding.h"

--- a/paddle/fluid/platform/call_once.h
+++ b/paddle/fluid/platform/call_once.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <mutex>
-
-namespace paddle {
-namespace platform {
-
-/*
- The current implementation of std::call_once has a bug described in
- https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call.
- This is likely caused by a deeper bug of pthread_once, which is discussed in
- https://patchwork.ozlabs.org/patch/482350/
-
- This wrap is a hack to avoid this bug.
-*/
-template <typename Callable, typename... Args>
-inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
-  bool good = true;
-  std::exception ex;
-  try {
-    std::call_once(flag,
-                   [&](Args&&... args) {
-                     try {
-                       f(args...);
-                     } catch (const std::exception& e) {
-                       ex = e;
-                       good = false;
-                     } catch (...) {
-                       ex = std::runtime_error("excption caught in call_once");
-                       good = false;
-                     }
-                   },
-                   args...);
-  } catch (std::system_error& x) {
-    throw std::runtime_error("call once failed");
-  }
-  if (!good) {
-    throw std::exception(ex);
-  }
-}
-
-}  // namespace platform
-}  // namespace paddle
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -18,7 +18,6 @@ limitations under the License. */

 #include <mutex>  // NOLINT

-#include "paddle/fluid/platform/call_once.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"

 namespace paddle {

--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/platform/float16.h"
+
+#include <vector>
+
+#include "gtest/gtest.h"
 #include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/lod_tensor.h"

-#include <gtest/gtest.h>
-
 namespace paddle {
 namespace platform {

@@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) {

  // Conversion operator
  EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
-  EXPECT_EQ(float(float16(0.5f)), 0.5f);
-  EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001);
-  EXPECT_EQ(int(float16(-1)), -1);
-  EXPECT_EQ(bool(float16(true)), true);
+  EXPECT_EQ(static_cast<float>(float16(0.5f)), 0.5f);
+  EXPECT_NEAR(static_cast<double>(float16(0.33333)), 0.33333, 0.0001);
+  EXPECT_EQ(static_cast<int>(float16(-1)), -1);
+  EXPECT_EQ(static_cast<bool>(float16(true)), true);
 }

 TEST(float16, arithmetic_cpu) {
-  EXPECT_EQ(float(float16(1) + float16(1)), 2);
-  EXPECT_EQ(float(float16(5) + float16(-5)), 0);
-  EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001);
-  EXPECT_EQ(float(float16(3) - float16(5)), -2);
-  EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001);
-  EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
-  EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
-  EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001);
-  EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f);
-  EXPECT_EQ(float(-float16(512.0f)), -512.0f);
-  EXPECT_EQ(float(-float16(-512.0f)), 512.0f);
+  EXPECT_EQ(static_cast<float>(float16(1) + float16(1)), 2);
+  EXPECT_EQ(static_cast<float>(float16(5) + float16(-5)), 0);
+  EXPECT_NEAR(static_cast<float>(float16(0.33333f) + float16(0.66667f)), 1.0f,
+              0.001);
+  EXPECT_EQ(static_cast<float>(float16(3) - float16(5)), -2);
+  EXPECT_NEAR(static_cast<float>(float16(0.66667f) - float16(0.33333f)),
+              0.33334f, 0.001);
+  EXPECT_NEAR(static_cast<float>(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
+  EXPECT_NEAR(static_cast<float>(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
+  EXPECT_NEAR(static_cast<float>(float16(2.0f) / float16(3.0f)), 0.66667f,
+              0.001);
+  EXPECT_EQ(static_cast<float>(float16(1.0f) / float16(2.0f)), 0.5f);
+  EXPECT_EQ(static_cast<float>(-float16(512.0f)), -512.0f);
+  EXPECT_EQ(static_cast<float>(-float16(-512.0f)), 512.0f);
 }

 TEST(float16, comparison_cpu) {

--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -36,19 +36,19 @@ limitations under the License. */
    half *in1, *in2, *out;                                    \
    half *d_in1, *d_in2, *d_out;                              \
    int size = sizeof(half);                                  \
-    cudaMalloc((void**)&d_in1, size);                         \
-    cudaMalloc((void**)&d_in2, size);                         \
-    cudaMalloc((void**)&d_out, size);                         \
-    in1 = (half*)malloc(size);                                \
-    in2 = (half*)malloc(size);                                \
-    out = (half*)malloc(size);                                \
+    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);       \
+    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);       \
+    cudaMalloc(reinterpret_cast<void**>(&d_out), size);       \
+    in1 = reinterpret_cast<half*>(malloc(size));              \
+    in2 = reinterpret_cast<half*>(malloc(size));              \
+    out = reinterpret_cast<half*>(malloc(size));              \
    in1[0] = half(float16(v_in1));                            \
    in2[0] = half(float16(v_in2));                            \
    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
    op_type<<<1, 1>>>(d_in1, d_in2, d_out);                   \
    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);     \
-    EXPECT_EQ(float(float16(out[0])), v_out);                 \
+    EXPECT_EQ(static_cast<float>(float16(out[0])), v_out);    \
    free(in1);                                                \
    free(in2);                                                \
    free(out);                                                \
@@ -63,17 +63,17 @@ limitations under the License. */
    half *in1, *in2;                                          \
    half *d_in1, *d_in2;                                      \
    int size = sizeof(half);                                  \
-    cudaMalloc((void**)&d_in1, size);                         \
-    cudaMalloc((void**)&d_in2, size);                         \
-    in1 = (half*)malloc(size);                                \
-    in2 = (half*)malloc(size);                                \
+    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);       \
+    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);       \
+    in1 = reinterpret_cast<half*>(malloc(size));              \
+    in2 = reinterpret_cast<half*>(malloc(size));              \
    in1[0] = half(float16(v_in1));                            \
    in2[0] = half(float16(v_in2));                            \
    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
    op_type<<<1, 1>>>(d_in1, d_in2);                          \
    cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost);     \
-    EXPECT_EQ(float(float16(in1[0])), v_out);                 \
+    EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out);    \
    free(in1);                                                \
    free(in2);                                                \
    cudaFree(d_in1);                                          \
@@ -87,12 +87,12 @@ limitations under the License. */
    half *d_in1, *d_in2;                                     \
    bool *out, *d_out;                                       \
    int size = sizeof(half);                                 \
-    cudaMalloc((void**)&d_in1, size);                        \
-    cudaMalloc((void**)&d_in2, size);                        \
-    cudaMalloc((void**)&d_out, 1);                           \
-    in1 = (half*)malloc(size);                               \
-    in2 = (half*)malloc(size);                               \
-    out = (bool*)malloc(1);                                  \
+    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);      \
+    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);      \
+    cudaMalloc(reinterpret_cast<void**>(&d_out), 1);         \
+    in1 = reinterpret_cast<half*>(malloc(size));             \
+    in2 = reinterpret_cast<half*>(malloc(size));             \
+    out = reinterpret_cast<bool*>(malloc(1));                \
    in1[0] = half(float16(v_in1));                           \
    in2[0] = half(float16(v_in2));                           \
    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);    \
@@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) {
  LOG(INFO) << "Test Neg on GPU!";
  half *in, *d_in;
  int size = sizeof(half);
-  cudaMalloc((void**)&d_in, size);
-  in = (half*)malloc(size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in), size);
+  in = reinterpret_cast<half*>(malloc(size));
  in[0] = half(float16(v_in));
  cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
  Neg<<<1, 1>>>(d_in);
  cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
-  EXPECT_EQ(float(float16(in[0])), v_out);
+  EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
  free(in);
  cudaFree(d_in);
 }

--- a/paddle/fluid/pybind/.gitignore
+++ b/paddle/fluid/pybind/.gitignore
+pybind.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -544,13 +544,20 @@ All parameter, weight, gradient are variables in Paddle.
           [](ParallelExecutor &self, size_t num_threads, bool use_event,
              const std::vector<platform::Place> &places,
              const std::unordered_set<std::string> &params,
-              const ProgramDesc &startup_program,
+              const std::unordered_set<std::string> &bcast_vars,
              const ProgramDesc &main_program, const std::string &loss_var_name,
-              Scope *scope, bool allow_op_delay) {
-             new (&self) ParallelExecutor(num_threads, use_event, places,
-                                          params, startup_program, main_program,
-                                          loss_var_name, scope, allow_op_delay);
+              Scope *scope, std::vector<Scope *> &local_scopes,
+              bool allow_op_delay) {
+             new (&self)
+                 ParallelExecutor(num_threads, use_event, places, params,
+                                  bcast_vars, main_program, loss_var_name,
+                                  scope, local_scopes, allow_op_delay);
           })
+      .def("local_scopes",
+           [](ParallelExecutor &self) -> std::vector<Scope *> * {
+             return &self.GetLocalScopes();
+           },
+           py::return_value_policy::reference)
      .def("run", &ParallelExecutor::Run);

  BindRecordIOWriter(&m);

--- a/python/.gitignore
+++ b/python/.gitignore
 *pyc
 build
 dist
+paddlepaddle.egg-info
 paddle.egg-info
 paddlepaddle_gpu.egg-info
 .idea

--- a/python/paddle/.gitignore
+++ b/python/paddle/.gitignore
+version.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -659,7 +659,7 @@ class Block(object):
    def __init__(self, program, idx):
        self.desc = program.desc.block(idx)
        self.vars = dict()  # var_name --> var
-        self.ops = collections.deque()  # operator list
+        self.ops = list()  # operator list
        self.program = program
        self.removed_vars = dict()

@@ -831,6 +831,13 @@ class Block(object):
        self.ops.append(op)
        return op

+    def insert_op(self, index, *args, **kwargs):
+        self.sync_with_cpp()
+        op_desc = self.desc.insert_op(index)
+        op = Operator(block=self, desc=op_desc, *args, **kwargs)
+        self.ops.insert(index, op)
+        return op
+
    def delete_ops(self, ops):
        # remove from cpp
        # FIXME(typhoonzero): remove only the first occurrence.
@@ -842,12 +849,12 @@ class Block(object):
        self.desc.remove_op(start, end + 1)

    def slice_ops(self, start, end):
-        return list(self.ops)[start:end]
+        return self.ops[start:end]

    def prepend_op(self, *args, **kwargs):
        op_desc = self.desc.prepend_op()
        op = Operator(self, op_desc, *args, **kwargs)
-        self.ops.appendleft(op)
+        self.ops.insert(0, op)
        return op

    def sync_with_cpp(self):
@@ -892,7 +899,7 @@ class Block(object):
        for index in range((start_index - 1 - 1), -1, -1):
            op_desc = ops_in_cpp[index]
            op = Operator(self, op_desc)
-            self.ops.appendleft(op)
+            self.ops.insert(0, op)

        # sync ops append to the end of cpp_ops
        for index in range((end_index + 1), len(ops_in_cpp)):

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor']

 class ParallelExecutor(object):
    def __init__(self,
-                 loss_name,
                 use_cuda,
+                 loss_name=None,
+                 main_program=None,
                 num_threads=None,
-                 allow_op_delay=False):
+                 allow_op_delay=False,
+                 share_vars_from=None):
+        """
+        ParallelExecutor can run program in parallel.
+
+        Args:
+            use_cuda(bool): Whether to use CUDA or not.
+            loss_name(str, default None): The loss name must set in training.
+            main_program(Program, default None): The program that need to run,
+                if not provided, then default_main_program will be used.
+            num_threads(int, default None): How many threads are used for
+                training.
+            allow_op_delay(bool, default False): Whether to delay and buffer
+                some operators together for scheduling or not, which may
+                improve performance in some cases, defalut False.
+            share_vars_from(ParallelExecutor, default None): If provied,
+                it will share variables from the specified ParallelExecutor.
+
+        Returns:
+            A ParallelExecutor object.
+
+        Raises:
+            TypeError: If share_vars_from is provided, but not ParallelExecutor
+                object.
+
+        Examples:
+            .. code-block:: python
+
+              train_exe = fluid.ParallelExecutor(
+                  use_cuda=True, loss_name=loss.name)
+              test_exe = fluid.ParallelExecutor(
+                  use_cuda=True,
+                  main_program=test_program,
+                  share_vars_from=train_exe)
+
+              train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
+              test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
+        """
+
        self._places = []
        self._act_places = []
        if use_cuda:
@@ -50,10 +89,21 @@ class ParallelExecutor(object):
            else:
                min(len(self._places) * 2, multiprocessing.cpu_count())

-        startup = framework.default_startup_program()
-        main = framework.default_main_program()
+        main = main_program
+        main = main if main else framework.default_main_program()
        scope = executor.global_scope()

+        if share_vars_from and not isinstance(share_vars_from,
+                                              ParallelExecutor):
+            raise TypeError("share_vars_from must be ParallelExecutor.")
+        local_scopes = share_vars_from.executor.local_scopes(
+        ) if share_vars_from else []
+
+        persistable_vars = [
+            v.name
+            for v in filter(lambda var: var.persistable, main.list_vars())
+        ]
+
        self.executor = core.ParallelExecutor(
            num_threads,
            True if use_cuda else False,  # use_event
@@ -62,10 +112,11 @@ class ParallelExecutor(object):
                p.name for p in main.global_block().iter_parameters()
                if not p.stop_gradient
            ]),
-            startup.desc,
+            set(persistable_vars),
            main.desc,
-            loss_name,
+            loss_name if loss_name else '',
            scope,
+            local_scopes,
            allow_op_delay)
        self.scope = scope


--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -207,7 +207,11 @@ class TestParallelExecutorBase(unittest.TestCase):
            if memory_opt:
                fluid.memory_optimize(main)

-            exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True)
+            place = fluid.CUDAPlace(0)
+            startup_exe = fluid.Executor(place)
+            startup_exe.run(startup)
+
+            exe = fluid.ParallelExecutor(True, loss_name=loss.name)
            if batch_size is not None:
                batch_size *= fluid.core.get_cuda_device_count()
            begin = time.time()
@@ -453,3 +457,41 @@ class TestTransformer(TestParallelExecutorBase):
    @unittest.skip("transformer is buggy in multi gpu")
    def test_main(self):
        self.check_network_convergence(transformer)
+
+
+class ParallelExecutorTestingDuringTraining(unittest.TestCase):
+    def test_parallel_testing(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = simple_fc_net(True)
+            test_program = main.clone(for_test=True)
+
+            opt = fluid.optimizer.SGD(learning_rate=0.0001)
+            opt.minimize(loss)
+
+            batch_size = 32
+            image = numpy.random.normal(size=(batch_size,
+                                              784)).astype('float32')
+            label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64")
+
+            place = fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(startup)
+            feed_dict = {'image': image, 'label': label}
+
+            train_exe = fluid.ParallelExecutor(
+                use_cuda=True, loss_name=loss.name, main_program=main)
+
+            test_exe = fluid.ParallelExecutor(
+                use_cuda=True,
+                main_program=test_program,
+                share_vars_from=train_exe)
+
+            for i in xrange(5):
+                test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
+                test_loss = numpy.array(test_loss)
+
+                train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
+                train_loss = numpy.array(train_loss)
+                self.assertTrue(numpy.allclose(train_loss, test_loss))