Commit a28a462e authored by: Y yi.wu

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into...

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fluid_benchmark_support_recordioreader
......@@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl
caffe/image/logs
tensorflow/image/logs
tensorflow/rnn/logs
fluid/models/*.pyc
fluid/logs
fluid/nohup.out
......@@ -40,10 +40,7 @@ def parse_args():
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
parser.add_argument(
'--learning_rate',
type=float,
default=0.001,
help='The minibatch size.')
'--learning_rate', type=float, default=0.001, help='The learning rate.')
parser.add_argument(
'--skip_batch_num',
type=int,
......@@ -51,7 +48,10 @@ def parse_args():
help='The number of initial minibatches to skip, for better performance measurement'
)
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
'--iterations',
type=int,
default=80,
help='The number of minibatches, set to -1 to run all batches.')
parser.add_argument(
'--pass_num', type=int, default=100, help='The number of passes.')
parser.add_argument(
......@@ -71,11 +71,12 @@ def parse_args():
type=int,
default=1,
help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
# this option is available only for vgg and resnet.
parser.add_argument(
'--data_set',
type=str,
default='flowers',
choices=['cifar10', 'flowers'],
choices=['cifar10', 'flowers', 'imagenet'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
......@@ -228,12 +229,18 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.pass_num):
train_losses = []
for batch_id, data in enumerate(train_reader()):
reader_generator = train_reader()
batch_id = 0
data = None
while True:
if not args.use_reader_op:
data = next(reader_generator, None)
if iters == args.iterations or data == None:
break
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
if args.use_reader_op:
loss = exe.run(train_prog, fetch_list=[avg_loss])
else:
......@@ -241,14 +248,13 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
feed=feeder.feed(data),
fetch_list=[avg_loss])
iters += 1
num_samples += len(data)
batch_id += 1
# FIXME(wuyi): last batch size maybe different
num_samples += args.batch_size
train_losses.append(loss)
print("Pass: %d, Iter: %d, Loss: %f\n" %
(pass_id, iters, np.mean(train_losses)))
train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
(num_samples, train_elapsed, examples_per_sec))
print_train_time(start_time, time.time(), num_samples)
print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses)))
# evaluation
if not args.no_test and batch_acc != None:
......@@ -309,7 +315,14 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
num_samples = 0
iters = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
reader_generator = train_reader()
batch_id = 0
data = None
while True:
if not args.use_reader_op:
data = next(reader_generator, None)
if iters == args.iterations or data == None:
break
if args.profile and pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
elif args.profile and pass_id == 0 and batch_id == 10:
......@@ -318,8 +331,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
# NOTE: if reader ops are used, the input data is not split across multiple cards
if args.use_reader_op and iters >= args.iterations / args.gpus:
break
......@@ -334,12 +345,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
if batch_id % 1 == 0:
print("Pass %d, batch %d, loss %s" %
(pass_id, batch_id, np.array(loss)))
train_elapsed = time.time() - start_time
batch_id += 1
if args.use_reader_op:
num_samples = num_samples * args.gpus
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
(num_samples, train_elapsed, examples_per_sec))
print_train_time(start_time, time.time(), num_samples)
if not args.no_test and batch_acc != None:
test_acc = test(startup_exe, infer_prog, test_reader, feeder,
batch_acc)
......@@ -350,12 +359,19 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- resnet Configuration Arguments -----------')
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def print_train_time(start_time, end_time, num_samples):
train_elapsed = end_time - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
(num_samples, train_elapsed, examples_per_sec))
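# Quick sanity-check sketch of the helper above (illustrative numbers only, not
# from a real run):
#   print_train_time(0.0, 25.0, 12800)
# reports 12800 examples in 25 seconds, i.e. 512 examples/sec.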
def main():
args = parse_args()
print_arguments(args)
......
......@@ -27,6 +27,7 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
from recordio_converter import imagenet_train, imagenet_test
def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
......@@ -123,13 +124,30 @@ def get_model(args):
else:
dshape = [32, 32, 3]
model = resnet_cifar10
else:
train_reader = paddle.dataset.cifar.train10()
test_reader = paddle.dataset.cifar.test10()
elif args.data_set == "flowers":
class_dim = 102
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
train_reader = paddle.dataset.flowers.train()
test_reader = paddle.dataset.flowers.test()
elif args.data_set == "imagenet":
class_dim = 1000
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if not args.data_dir:
raise Exception(
"Must specify --data_dir when training with imagenet")
train_reader = imagenet_train(args.data_dir)
test_reader = imagenet_test(args.data_dir)
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
......@@ -163,15 +181,10 @@ def get_model(args):
optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
train_reader = paddle.batch(
batched_train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
train_reader, buf_size=5120),
batch_size=args.batch_size)
batched_test_reader = paddle.batch(test_reader, batch_size=args.batch_size)
return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc
......@@ -13,6 +13,7 @@
# limitations under the License.
import os
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
......@@ -53,6 +54,13 @@ def prepare_flowers(outpath, batch_size):
[1])
def default_mapper(sample):
img, label = sample
img = image.simple_transform(
img, 256, 224, True, mean=[103.94, 116.78, 123.68])
return img.flatten().astype('float32'), label
def imagenet_train(data_dir):
contents = os.listdir(data_dir)
if set(contents) != set(
......@@ -68,6 +76,8 @@ def imagenet_train(data_dir):
img, lbl = l[:-1].split(" ")
img2label[img] = int(lbl)
imgfilelist.append(img)
# shuffle all, this is slow
random.shuffle(imgfilelist)
def train_reader():
for idx, imgfile in enumerate(imgfilelist):
......@@ -76,15 +86,36 @@ def imagenet_train(data_dir):
label = [img2label[imgfile], ]
yield [data, label]
def default_mapper(sample):
img, label = sample
img = image.simple_transform(
img, 256, 224, True, mean=[103.94, 116.78, 123.68])
return img.flatten().astype('float32'), label
return paddle.reader.map_readers(default_mapper, train_reader)
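# Illustrative usage sketch (not part of this module): the reader returned
# above yields (flattened float32 image, label) samples, so it can be batched
# like any other Paddle reader. "/path/to/imagenet" is a placeholder for a
# directory with the layout checked above.
#
#   import paddle
#   train_reader = paddle.batch(imagenet_train("/path/to/imagenet"), batch_size=32)
#   for batch in train_reader():
#       pass  # each batch is a list of (image, label) samples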
def imagenet_test(data_dir):
contents = os.listdir(data_dir)
if set(contents) != set(
["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
raise Exception("Imagenet data contents error!")
img2label = dict()
imgfilelist = []
with open(os.path.join(data_dir, "val.txt")) as fn:
while 1:
l = fn.readline()
if not l:
break
img, lbl = l[:-1].split(" ")
img2label[img] = int(lbl)
imgfilelist.append(img)
def test_reader():
for idx, imgfile in enumerate(imgfilelist):
base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
image_path = ".".join([base_path, "jpeg"])
data = image.load_image(image_path)
label = [img2label[imgfile], ]
yield [data, label]
return paddle.reader.map_readers(default_mapper, test_reader)
# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
def convert_reader_to_recordio_files(
filename,
......
......@@ -2,6 +2,7 @@
# This script benchmarks the PaddlePaddle Fluid on
# single thread single GPU.
mkdir -p logs
#export FLAGS_fraction_of_gpu_memory_to_use=0.0
export CUDNN_PATH=/paddle/cudnn_v5
......@@ -35,6 +36,7 @@ nohup stdbuf -oL nvidia-smi \
--format=csv \
--filename=mem.log \
-l 1 &
# mnist
# mnist gpu mnist 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
......@@ -43,7 +45,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=500 \
2>&1 | tee -a mnist_gpu_128.log
2>&1 | tee -a logs/mnist_gpu_128.log
# vgg16
# gpu cifar10 128
......@@ -53,7 +55,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a vgg16_gpu_128.log
2>&1 | tee -a logs/vgg16_gpu_128.log
# flowers gpu 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
......@@ -63,28 +65,28 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--data_set=flowers \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a vgg16_gpu_flowers_32.log
2>&1 | tee -a logs/vgg16_gpu_flowers_32.log
# resnet50
# resnet50 gpu cifar10 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=resnet50 \
--model=resnet \
--device=GPU \
--batch_size=128 \
--data_set=cifar10 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a resnet50_gpu_128.log
2>&1 | tee -a logs/resnet50_gpu_128.log
# resnet50 gpu flowers 64
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=resnet50 \
--model=resnet \
--device=GPU \
--batch_size=64 \
--data_set=flowers \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a resnet50_gpu_flowers_64.log
2>&1 | tee -a logs/resnet50_gpu_flowers_64.log
# lstm
# lstm gpu imdb 32 # tensorflow only support batch=32
......@@ -94,7 +96,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--batch_size=32 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a lstm_gpu_32.log
2>&1 | tee -a logs/lstm_gpu_32.log
# seq2seq
# seq2seq gpu wmb 128
......@@ -104,4 +106,4 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a lstm_gpu_128.log
2>&1 | tee -a logs/lstm_gpu_128.log
......@@ -33,10 +33,18 @@ ELSE()
SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
ENDIF()
# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them
ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
URL "http://paddlepaddledeps.bj.bcebos.com/grpc.tar.xz"
# NOTE(wuyi):
# this package is generated by following steps:
# 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
# 2. submodule update --init
# 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party
# 4. remove .git, and package the directory.
URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
......@@ -49,7 +57,6 @@ ExternalProject_Add(
INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
)
# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
......
......@@ -59,3 +59,21 @@ get_inference_program
.. autofunction:: paddle.fluid.io.get_inference_program
:noindex:
save_checkpoint
---------------
.. autofunction:: paddle.fluid.io.save_checkpoint
:noindex:
load_checkpoint
---------------
.. autofunction:: paddle.fluid.io.load_checkpoint
:noindex:
clean_checkpoint
----------------
.. autofunction:: paddle.fluid.io.clean_checkpoint
:noindex:
......@@ -181,6 +181,12 @@ Print
.. autofunction:: paddle.fluid.layers.Print
:noindex:
is_empty
--------
.. autofunction:: paddle.fluid.layers.is_empty
:noindex:
device
======
......@@ -255,6 +261,19 @@ double_buffer
.. autofunction:: paddle.fluid.layers.double_buffer
:noindex:
random_data_generator
---------------------
.. autofunction:: paddle.fluid.layers.random_data_generator
:noindex:
Preprocessor
------------
.. autoclass:: paddle.fluid.layers.Preprocessor
:members:
:noindex:
nn
==
......@@ -594,6 +613,29 @@ roi_pool
.. autofunction:: paddle.fluid.layers.roi_pool
:noindex:
dice_loss
---------
.. autofunction:: paddle.fluid.layers.dice_loss
:noindex:
resize_bilinear
---------------
.. autofunction:: paddle.fluid.layers.resize_bilinear
:noindex:
gather
------
.. autofunction:: paddle.fluid.layers.gather
:noindex:
random_crop
-----------
.. autofunction:: paddle.fluid.layers.random_crop
:noindex:
ops
===
......@@ -742,6 +784,12 @@ sum
.. autofunction:: paddle.fluid.layers.sum
:noindex:
shape
-----
.. autofunction:: paddle.fluid.layers.shape
:noindex:
sigmoid
-------
......@@ -991,27 +1039,3 @@ zeros
.. autofunction:: paddle.fluid.layers.zeros
:noindex:
topk
----
.. autofunction:: paddle.fluid.layers.topk
:noindex:
dice_loss
----
.. autofunction:: paddle.fluid.layers.dice_loss
:noindex:
upsampling_bilinear2d
____
.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
:noindex:
gather
____
.. autofunction:: paddle.fluid.layers.gather
:noindex:
......@@ -47,28 +47,6 @@ DecayedAdagrad
:members:
:noindex:
Adadelta
-----------------
.. autoclass:: paddle.fluid.optimizer.Adadelta
:members:
:noindex:
RMSProp
-----------------
.. autoclass:: paddle.fluid.optimizer.RMSProp
:members:
:noindex:
ModelAverage
-----------------
.. autoclass:: paddle.fluid.optimizer.ModelAverage
:members:
:noindex:
SGDOptimizer
------------
......@@ -111,25 +89,31 @@ DecayedAdagradOptimizer
:members:
:noindex:
RMSPropOptimizer
----------------
AdadeltaOptimizer
-----------------
.. autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
:members:
:noindex:
Adadelta
--------
RMSPropOptimizer
-----------------
.. autoclass:: paddle.fluid.optimizer.Adadelta
:members:
:noindex:
.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
ModelAverage
------------
.. autoclass:: paddle.fluid.optimizer.ModelAverage
:members:
:noindex:
Optimizer
---------
.. autoclass:: paddle.fluid.optimizer.Optimizer
:members:
:noindex:
......@@ -23,3 +23,15 @@ profiler
.. autofunction:: paddle.fluid.profiler.profiler
:noindex:
start_profiler
--------------
.. autofunction:: paddle.fluid.profiler.start_profiler
:noindex:
stop_profiler
-------------
.. autofunction:: paddle.fluid.profiler.stop_profiler
:noindex:
......@@ -35,7 +35,7 @@ The computation `Program` consists of nested `Blocks`. Each `Block` will consist
## Definition of VarType
A VarDesc should have a name, type and whether or not it is persistable. The are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
A VarDesc should have a name, type and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
```proto
message VarDesc {
......
# How to use RecordIO in Fluid
If you want to use RecordIO as your training data format, you need to convert your training data
to RecordIO files and read them during training. PaddlePaddle Fluid provides some
interfaces to deal with RecordIO files.
## Generate RecordIO File
Before starting training with RecordIO files, you need to convert your training data
to RecordIO format with `fluid.recordio_writer.convert_reader_to_recordio_file`. Sample code
is as follows:
```python
reader = paddle.batch(mnist.train(), batch_size=1)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder)
```
The above code snippet would generate a RecordIO `./mnist.recordio` on your host.
**NOTE**: we recommend setting `batch_size=1` when generating the RecordIO files so that the batch size
can be adjusted flexibly when reading them.
## Use the RecordIO file in a Local Training Job
PaddlePaddle Fluid provides the interface `fluid.layers.io.open_recordio_file` to load your RecordIO file,
which you can then use as a layer in your network configuration. Sample code is as follows:
```python
data_file = fluid.layers.io.open_recordio_file(
filename="./mnist.recordio",
shapes=[(-1, 784),(-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int32"])
data_file = fluid.layers.io.batch(data_file, batch_size=4)
img, label = fluid.layers.io.read_file(data_file)
hidden = fluid.layers.fc(input=img, size=100, act='tanh')
prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
avg_loss_np = []
# train a pass
batch_id = 0
while True:
tmp, = exe.run(fetch_list=[avg_loss])
avg_loss_np.append(tmp)
print(batch_id)
batch_id += 1
```
## Use the RecordIO files in Distributed Training
1. generate multiple RecordIO files
For a distributed training job, you may have multiple trainer nodes
and one or more RecordIO files for each trainer node. You can use the interface
`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data
into multiple RecordIO files. Sample code is as follows:
```python
reader = paddle.batch(mnist.train(), batch_size=1)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
fluid.recordio_writer.convert_reader_to_recordio_files(
'./mnist.recordio', 100, reader, feeder)  # filename_suffix, batch_per_file, reader, feeder
```
The above code would generate multiple RecordIO files on your host, like:
```bash
.
|-mnist-00000.recordio
|-mnist-00001.recordio
|-mnist-00002.recordio
|-mnist-00003.recordio
|-mnist-00004.recordio
```
2. open multiple RecordIO files by `fluid.layers.io.open_files`
For a distributed training job, the distributed operator system will schedule trainer processes on multiple nodes.
Each trainer process reads part of the whole training data, and we usually take the following approach to make the
training data allocated to each trainer process as uniform as possible:
```python
def gen_train_list(file_pattern, trainers, trainer_id):
file_list = glob.glob(file_pattern)
ret_list = []
for idx, f in enumerate(file_list):
if idx % trainers == trainer_id:
ret_list.append(f)
return ret_list
trainers = int(os.getenv("TRAINERS"))
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
data_file = fluid.layers.io.open_files(
filenames=gen_train_list("./mnist-[0-9]*.recordio", trainers, trainer_id),
thread_num=1,
shapes=[(-1, 784),(-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int32"])
img, label = fluid.layers.io.read_file(data_file)
...
```
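For example, with five of the RecordIO files generated above and two trainers, the modulo split in `gen_train_list` assigns files like this (a quick illustrative sketch, not part of the training script):

```python
files = ["mnist-0000%d.recordio" % i for i in range(5)]
trainers = 2
# trainer 0 reads mnist-00000, mnist-00002, mnist-00004
print([f for idx, f in enumerate(files) if idx % trainers == 0])
# trainer 1 reads mnist-00001, mnist-00003
print([f for idx, f in enumerate(files) if idx % trainers == 1])
```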
../../../../../benchmark/cluster/README.md
\ No newline at end of file
../../../../../../benchmark/cluster/vgg16/README.md
\ No newline at end of file
......@@ -63,16 +63,16 @@ Android的Docker开发镜像向用户提供两个可配置的参数:
- Build the PaddlePaddle library for `armeabi-v7a` and `Android API 21`
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
```
- Build the PaddlePaddle library for `arm64-v8a` and `Android API 21`
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
```
When the above `docker run` command is executed, the container by default runs the [paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) script. The script records the CMake configuration commonly used for cross-compiling the Android version of the PaddlePaddle library, and it automatically builds the standalone toolchain, compiles, and installs according to `ANDROID_ABI` and `ANDROID_API`. Since the arm64 architecture requires an Android API level of at least 21, when `ANDROID_ABI=arm64-v8a` and `ANDROID_API<21`, the Docker container will default to the `Android API 21` toolchain. You can refer to the [Configuring cross-compilation parameters](#配置交叉编译参数) section below to customize the script executed by the Docker container to your own needs. After the build and installation finish, the PaddlePaddle C-API library is installed to `$PWD/install_android`, and the third-party libraries it depends on are installed to `$PWD/install_android/third_party`.
When the above `docker run` command is executed, the container runs the [paddle/scripts/paddle_build.sh build_android](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh) script. The script records the CMake configuration commonly used for cross-compiling the Android version of the PaddlePaddle library, and it automatically builds the standalone toolchain, compiles, and installs according to `ANDROID_ABI` and `ANDROID_API`. Since the arm64 architecture requires an Android API level of at least 21, when `ANDROID_ABI=arm64-v8a` and `ANDROID_API<21`, the Docker container will default to the `Android API 21` toolchain. You can refer to the [Configuring cross-compilation parameters](#配置交叉编译参数) section below to customize the script executed by the Docker container to your own needs. After the build and installation finish, the PaddlePaddle C-API library is installed to `$PWD/install_android`, and the third-party libraries it depends on are installed to `$PWD/install_android/third_party`.
## Building with a Linux cross-compilation environment
This document uses the Linux x86-64 platform as an example to introduce the method and steps for cross-compiling the PaddlePaddle library for the Android platform.
......
......@@ -36,7 +36,7 @@ $ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android
$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android ./paddle/scripts/paddle_build.sh build_android
```
The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
......@@ -70,7 +70,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`. For information about other configuration arguments, please continue reading.
The build command, [`paddle/scripts/paddle_build.sh build_android`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`. For information about other configuration arguments, please continue reading.
The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`.
......
......@@ -23,7 +23,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
You can find how to build and use the paddle_manylinux_devel image `here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__,
or refer to the optional steps below to build the Docker image for compiling PaddlePaddle from source.
If you choose not to use the Docker image, you need to install the `编译依赖`_ listed in the section below on your machine before you can start compiling.
If you choose not to use the Docker image, you need to install the :ref:`编译依赖 <_compile_deps>` listed in the section below on your machine before you can start compiling.
To compile PaddlePaddle, run:
......@@ -106,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
- How hard is it to learn Docker?
Understanding Docker is not hard. Spending about ten minutes reading [this article](https://zhuanlan.zhihu.com/p/19902938) can save you the hour of installing and configuring various development tools, plus the hassle of reinstalling them when you switch machines. And don't forget that PaddlePaddle updates may require new development tools, not to mention the benefit of making problems easier to reproduce.
Understanding Docker is not hard. Spending about ten minutes reading `this article <https://zhuanlan.zhihu.com/p/19902938>`_ can save you the hour of installing and configuring various development tools, plus the hassle of reinstalling them when you switch machines. And don't forget that PaddlePaddle updates may require new development tools, not to mention the benefit of making problems easier to reproduce.
- Can I use an IDE?
......@@ -123,7 +123,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
- Can I build in parallel?
Yes. Our Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh). The script calls `make -j$(nproc)` to start as many processes as there are CPU cores and build in parallel.
Yes. Our Docker image runs a `Bash脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ . The script calls `make -j$(nproc)` to start as many processes as there are CPU cores and build in parallel.
- Docker requires sudo
......@@ -131,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
- Building on Windows/MacOS is slow
Docker runs on both Windows and MacOS, but it actually runs inside a Linux virtual machine. You may need to allocate more CPUs and memory to this VM to keep the build efficient. Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
Docker runs on both Windows and MacOS, but it actually runs inside a Linux virtual machine. You may need to allocate more CPUs and memory to this VM to keep the build efficient. Please refer to `this issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_ for details.
- Not enough disk space
The examples in this article all pass the `--rm` option to the `docker run` command, which ensures that containers are not kept on disk after they finish. You can use the `docker ps -a` command to see containers that have stopped but have not been removed. The `docker build` command sometimes produces intermediate results, unnamed images, which also take up disk space. You can refer to [this article](https://zaiste.net/posts/removing_docker_containers/) to clean them up.
The examples in this article all pass the `--rm` option to the `docker run` command, which ensures that containers are not kept on disk after they finish. You can use the `docker ps -a` command to see containers that have stopped but have not been removed. The `docker build` command sometimes produces intermediate results, unnamed images, which also take up disk space. You can refer to `this article <https://zaiste.net/posts/removing_docker_containers/>`_ to clean them up.
.. _compile_deps:
......@@ -211,7 +211,7 @@ PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,
Setting the compile options
++++++++++++++++++++++++++++++
PaddlePaddle references the various BLAS/CUDA/cuDNN libraries through paths specified at compile time. When building with cmake, it first searches the system paths ( :code:`/usr/lib:/usr/local/lib` ) for these libraries, and also reads the related path variables to search for them. They can be set with the ``-D`` option, for example
.. code-block:: bash
......
......@@ -11,7 +11,7 @@ To build PaddlePaddle, you need
1. A computer -- Linux, Windows, MacOS.
2. Docker.
Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image.
Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image.
We run all the tools by running this image.
.. _build_step:
......@@ -26,6 +26,8 @@ you can also find how to build and use paddle_manylinux_devel Docker image from
`here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__
Or you can build your own image from source as the optional step below:
If you don't wish to use Docker, you need to install several compile dependencies manually, as :ref:`Compile Dependencies <_compile_deps>` shows, before starting compilation.
.. code-block:: bash
# 1. clone the source code
......@@ -108,7 +110,7 @@ Frequently Asked Questions
- How difficult is it to learn Docker?
It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools. Not even to mention the time saved when other people trying to reproduce the issue you have.
It takes you ten minutes to read `an introductory article <https://docs.docker.com/get-started>`_ and saves you more than one hour of installing and configuring all the required build tools, especially when new versions of PaddlePaddle require new tools. Not to mention the time saved when other people try to reproduce your issue.
- Can I use my favorite IDE?
......@@ -125,7 +127,7 @@ Frequently Asked Questions
- Does Docker do parallel building?
Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
Our build Docker image runs a `Bash script <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ , which calls `make -j$(nproc)` to start as many processes as the number of your CPU cores.
- Docker requires sudo
......@@ -133,11 +135,11 @@ Frequently Asked Questions
- Docker on Windows/MacOS builds slowly
On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs to make the build efficient. Please refer to `this issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_ for details.
- Not enough disk space
Examples in this article use option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
Examples in this article use the `--rm` option with the `docker run` command. This option ensures that stopped containers are not kept on disk. We can use `docker ps -a` to list all containers, including stopped ones. Sometimes `docker build` generates intermediate dangling images, which also take disk space. To clean them, please refer to `this article <https://zaiste.net/posts/removing_docker_containers/>`_ .
.. _compile_deps:
......
......@@ -17,33 +17,77 @@ if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif(APPLE)
set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files")
set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
set(inference_deps paddle_inference_api paddle_fluid_api)
# if anakin is set enable anakin api implementation
if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
set(ANAKIN_FOUND ON)
else()
set(ANAKIN_FOUND OFF)
endif()
if (ANAKIN_FOUND)
# Anakin's code style doesn't follow google c style.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment
-Wno-error=reorder
-Wno-error=format
-Wno-error=switch
-Wno-error=return-type
-Wno-error=non-virtual-dtor
-Wno-error=cpp")
message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
include_directories("${ANAKIN_INCLUDE}")
# Anakin's source layout is a mess, so its sub-directories need to be added one by one.
include_directories("${ANAKIN_INCLUDE}/saber")
link_directories("${ANAKIN_LIBRARY}")
nv_library(inference_anakin_api SRCS paddle_inference_api_anakin_engine.cc)
target_link_libraries(inference_anakin_api anakin)
list(APPEND inference_deps inference_anakin_api)
endif()
function(inference_api_test TARGET_NAME)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
cc_test(test_paddle_inference_${TARGET_NAME}
SRCS test_paddle_inference_${TARGET_NAME}.cc
DEPS paddle_fluid_api paddle_inference_api
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
if(inference_test_ARGS)
set_tests_properties(test_paddle_inference_${TARGET_NAME}
PROPERTIES DEPENDS "${inference_test_ARGS}")
endif()
endfunction(inference_api_test)
if (WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
cc_test(${TARGET_NAME}
SRCS ${TARGET_NAME}.cc
DEPS "${inference_deps}"
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
if(inference_test_ARGS)
set_tests_properties(${TARGET_NAME}
PROPERTIES DEPENDS "${inference_test_ARGS}")
endif()
endif(WITH_TESTING)
endfunction(inference_api_test)
cc_library(paddle_inference_api
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
if(WITH_TESTING)
cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api)
cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api)
inference_api_test(api_impl
ARGS test_word2vec test_image_classification)
inference_api_test(test_paddle_inference_api_impl
ARGS test_word2vec test_image_classification)
if (ANAKIN_FOUND)
nv_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
DEPS ${inference_deps} protobuf)
endif()
if(WITH_TESTING)
add_subdirectory(demo)
endif()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
inference_api_test(simple_on_word2vec ARGS test_word2vec)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains a simple demo for how to take a model for inference.
*/
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <memory>
#include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle {
namespace demo {
DEFINE_string(dirname, "", "Directory of the inference model.");
void Main(bool use_gpu) {
//# 1. Create PaddlePredictor with a config.
NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
for (int batch_id = 0; batch_id < 3; batch_id++) {
//# 2. Prepare input.
int64_t data[4] = {1, 2, 3, 4};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = buf,
.dtype = PaddleDType::INT64};
// For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> slots(4, tensor);
//# 3. Run
std::vector<PaddleTensor> outputs;
CHECK(predictor->Run(slots, &outputs));
//# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length;
const size_t num_elements = outputs.front().data.length / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
}
}
}
TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
} // namespace demo
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains the definition of a simple Inference API for Paddle.
......@@ -40,14 +40,23 @@ struct PaddleBuf {
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
// TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
// TODO(Superjomn) support the following engines later.
// kTensorRT, // Use TensorRT for inference.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
// kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
};
/*
* A simple Inference API for Paddle. Currently this API can be used in
* non-sequence scenarios.
* TODO(Superjomn) Support another API for NLP-related usages.
*/
class PaddlePredictor {
public:
......@@ -69,15 +78,6 @@ class PaddlePredictor {
// Destroy the Predictor.
virtual ~PaddlePredictor() {}
enum class EngineKind {
kNative = -1, // Use the native Fluid facility.
// TODO(Superjomn) support latter.
// kAnakin, // Use Anakin for inference.
// kTensorRT, // Use TensorRT for inference.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
// kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
};
// The common configs for all the predictors.
struct Config {
std::string model_dir; // path to the model directory.
......@@ -86,18 +86,31 @@ class PaddlePredictor {
};
struct NativeConfig : public PaddlePredictor::Config {
// GPU related fields.
bool use_gpu{false};
int device;
float fraction_of_gpu_memory;
int device{0};
float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization.
std::string prog_file;
std::string param_file;
bool share_variables;
};
// A factory to help create difference predictor.
template <
typename ConfigT,
PaddlePredictor::EngineKind engine = PaddlePredictor::EngineKind::kNative>
// Configurations for Anakin engine.
struct AnakinConfig : public PaddlePredictor::Config {
int device;
std::string model_file;
int max_batch_size{-1};
};
// A factory to help create different predictors.
//
// FOR EXTENSION DEVELOPER:
// Different predictors are designated by config type and engine kind. Similar
// configs can be merged, but there shouldn't be a huge config containing
// different fields for more than one kind of predictors.
//
// Similarly, each engine kind should map to a unique predictor implementation.
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda.h>
#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
namespace paddle {
PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
const AnakinConfig &config) {
CHECK(Init(config));
}
bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
// TODO(Superjomn) Tell anakin to support return code.
engine_.Build(config.model_file, config.max_batch_size);
return true;
}
bool PaddleInferenceAnakinPredictor::Run(
const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
for (const auto &input : inputs) {
if (input.dtype != PaddleDType::FLOAT32) {
LOG(ERROR) << "Only support float type inputs. " << input.name
<< "'s type is not float";
return false;
}
engine_.SetInputFromCPU(
input.name, static_cast<float *>(input.data.data), input.data.length);
}
// TODO(Superjomn) Tell anakin to support return code.
engine_.Execute();
if (output_data->empty()) {
LOG(ERROR) << "At least one output should be set with tensors' names.";
return false;
}
for (auto &output : *output_data) {
auto *tensor = engine_.GetOutputInGPU(output.name);
output.shape = tensor->shape();
// Copy data from GPU -> CPU
if (cudaMemcpy(output.data.data,
tensor->data(),
tensor->size(),
cudaMemcpyDeviceToHost) != 0) {
LOG(ERROR) << "copy data from GPU to CPU error";
return false;
}
}
return true;
}
// TODO(Superjomn) To be implemented later.
std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
return nullptr;
}
// A factory to help create different predictors.
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
const AnakinConfig &config) {
std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor(config));
return x;
};
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains the implementation of inference API with Anakin engine
* embedded; this API can only support Anakin models.
*/
#pragma once
// NOTE This header file does not have a namespace.
// TODO(Superjomn) Tell Anakin to provide better APIs.
#include <test/framework/net/paddle_api.h>
#include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle {
class PaddleInferenceAnakinPredictor : public PaddlePredictor {
public:
PaddleInferenceAnakinPredictor(const AnakinConfig& config);
// NOTE Unlike the native engine, the buffers of anakin engine's output_data
// should be allocated first.
// TODO(Superjomn) should unify all the behaviors of output_data across all
// the engines.
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) override;
std::unique_ptr<PaddlePredictor> Clone() override;
private:
bool Init(const AnakinConfig& config);
anakin::AnakinEngine<anakin::NV,
anakin::saber::AK_FLOAT,
anakin::Precision::FP32>
engine_;
};
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
#include <gtest/gtest.h>
namespace paddle {
TEST(inference, anakin) {
AnakinConfig config;
auto engine =
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
}
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sys/time.h>
#include <algorithm>
......@@ -57,8 +57,7 @@ std::string num2str(T a) {
bool NativePaddlePredictor::Init() {
VLOG(3) << "Predictor::init()";
// TODO(panyx0718): Should CPU vs GPU device be decided by id?
if (config_.device >= 0) {
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
......@@ -85,11 +84,13 @@ bool NativePaddlePredictor::Init() {
}
ctx_ = executor_->Prepare(*inference_program_, 0);
// Create variables
// TODO(panyx0718): Why need to test share_variables here?
if (config_.share_variables) {
executor_->CreateVariables(*inference_program_, scope_.get(), 0);
}
// Create temporary variables first, so that the first batch does not need to
// create variables at runtime. This is the logic of the old inference
// API.
// TODO(Superjomn) this should be modified when `Clone` is valid for
// multi-thread application.
executor_->CreateVariables(*inference_program_, scope_.get(), 0);
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames();
......@@ -124,7 +125,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
scope_.get(),
&feed_targets,
&fetch_targets,
!config_.share_variables);
false /* don't create variables each time */);
if (!GetFetch(fetchs, output_data)) {
LOG(ERROR) << "fail to get fetchs";
return false;
......@@ -242,11 +243,16 @@ bool NativePaddlePredictor::GetFetch(
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<NativeConfig, PaddlePredictor::EngineKind::kNative>(
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
const NativeConfig &config) {
VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) {
// 1. GPU memory
PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory,
0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) {
......
......@@ -47,7 +47,6 @@ NativeConfig GetConfig() {
config.fraction_of_gpu_memory = 0.15;
config.use_gpu = true;
config.device = 0;
config.share_variables = true;
return config;
}
......@@ -75,7 +74,7 @@ TEST(paddle_inference_api_impl, word2vec) {
ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length;
float* data = static_cast<float*>(outputs[0].data.data);
for (int j = 0; j < len / sizeof(float); ++j) {
for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0);
}
......@@ -93,7 +92,7 @@ TEST(paddle_inference_api_impl, word2vec) {
TestInference<platform::CPUPlace>(config.model_dir, cpu_feeds, cpu_fetchs1);
float* lod_data = output1.data<float>();
for (size_t i = 0; i < output1.numel(); ++i) {
for (int i = 0; i < output1.numel(); ++i) {
EXPECT_LT(lod_data[i] - data[i], 1e-3);
EXPECT_GT(lod_data[i] - data[i], -1e-3);
}
......
......@@ -169,17 +169,13 @@ void BlockDesc::Flush() {
}
if (need_update_) {
auto &op_field = *this->desc_->mutable_ops();
this->ClearPBOps();
op_field.Reserve(static_cast<int>(ops_.size()));
this->desc_->mutable_ops()->Clear();
for (auto &op_desc : ops_) {
op_field.AddAllocated(op_desc->Proto());
this->desc_->mutable_ops()->Add()->CopyFrom(*op_desc->Proto());
}
auto &var_field = *this->desc_->mutable_vars();
this->ClearPBVars();
var_field.Reserve(static_cast<int>(vars_.size()));
this->desc_->mutable_vars()->Clear();
for (auto &var_desc : vars_) {
var_field.AddAllocated(var_desc.second->Proto());
this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto());
}
need_update_ = false;
}
......@@ -217,22 +213,6 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
}
}
void BlockDesc::ClearPBOps() {
auto ops = this->desc_->mutable_ops();
while (!ops->empty()) {
// we do not own the OpDesc, so release the ownership.
ops->ReleaseLast();
}
}
void BlockDesc::ClearPBVars() {
auto vars = this->desc_->mutable_vars();
while (!vars->empty()) {
// we do not own the VarDesc, so release the ownership.
vars->ReleaseLast();
}
}
void BlockDesc::SetForwardBlockID(int32_t forward_block_id) {
PADDLE_ENFORCE(!desc_->has_forward_block_idx(),
"Parent block ID has been set to %d. Cannot set to %d",
......
......@@ -41,11 +41,6 @@ class BlockDesc {
BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog);
~BlockDesc() {
this->ClearPBVars();
this->ClearPBOps();
}
int32_t ID() const { return desc_->idx(); }
int32_t Parent() const { return desc_->parent_idx(); }
......@@ -113,10 +108,6 @@ class BlockDesc {
ProgramDesc *Program() const { return this->prog_; }
private:
void ClearPBOps();
void ClearPBVars();
private:
ProgramDesc *prog_; // not_own
proto::BlockDesc *desc_; // not_own
......
......@@ -220,8 +220,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
has_fetch_operators(program.Block(0), *fetch_targets, fetch_holder_name);
ProgramDesc* copy_program = const_cast<ProgramDesc*>(&program);
std::unique_ptr<ProgramDesc> unique_ptr_of_copy_program;
if (!has_feed_ops || !has_fetch_ops) {
copy_program = std::unique_ptr<ProgramDesc>(new ProgramDesc(program)).get();
unique_ptr_of_copy_program.reset(new ProgramDesc(program));
copy_program = unique_ptr_of_copy_program.get();
}
auto* global_block = copy_program->MutableBlock(0);
......
......@@ -8,3 +8,5 @@ nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
......@@ -21,7 +21,8 @@ namespace tensorrt {
class Conv2dOpConverter : public OpConverter {
public:
Conv2dOpConverter() {}
void operator()(const framework::proto::OpDesc& op) override {
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope) override {
LOG(INFO)
<< "convert a fluid conv2d op to tensorrt conv layer without bias";
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace inference {
namespace tensorrt {
// Reorder the elements from istrides to ostrides, borrowed from TRT convert in
// tensorflow.
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318
template <typename T>
void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
T* odata, nvinfer1::DimsHW ostrides) {
for (int h = 0; h < shape.h(); ++h) {
for (int w = 0; w < shape.w(); ++w) {
odata[h * ostrides.h() + w * ostrides.w()] =
idata[h * istrides.h() + w * istrides.w()];
}
}
}
// Reorder the data layout from CK to KC.
void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
TensorRTEngine::Weight* oweights) {
int c = iweights.dims[0];
int k = iweights.dims[1];
oweights->dims.assign({k, c});
nvinfer1::DimsHW istrides = {1, k};
nvinfer1::DimsHW ostrides = {c, 1};
Reorder2({k, c}, static_cast<float const*>(iweights.get().values), istrides,
static_cast<float*>(const_cast<void*>(oweights->get().values)),
ostrides);
}
/*
* FC converter convert a MUL op in Fluid to a FC layer in TRT.
*/
class FcOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope) override {
VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
// Declare inputs
auto* X = engine_->GetITensor(op_desc.Input("X").front());
// Declare weights
auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
// This may trigger a GPU->CPU copy, because TRT's weight can only be
// assigned from CPU memory, that can't be avoided.
auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix
size_t n_output = Y_t->dims()[1];
framework::LoDTensor tmp;
tmp.Resize(Y_t->dims());
memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(),
Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)};
TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
static_cast<void*>(tmp.data<float>()),
Y_t->memory_size() / sizeof(float));
weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
tmp_weight.dims = weight.dims;
// The data layout of TRT FC layer's weight is different from fluid's FC,
// need to reorder the elements.
ReorderCKtoKC(tmp_weight, &weight);
// Currently, the framework can only handle one fluid op -> one TRT layer,
// but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
// handle `mul`, leave `add` as another layer.
// DEBUG
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
*const_cast<nvinfer1::ITensor*>(X),
n_output, weight.get(), bias.get());
auto output_name = op_desc.Output("Out").front();
engine_->DeclareOutput(layer, 0, output_name);
}
};
REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(mul);
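As an aside (not part of the diff), the CK-to-KC reorder above amounts to a transpose of the 2-D weight matrix; a rough NumPy sketch of the same index arithmetic, with made-up sizes:

import numpy as np

# Hypothetical sizes: c input channels, k output channels.
c, k = 3, 2
w_ck = np.arange(c * k, dtype=np.float32).reshape(c, k)  # fluid layout: C x K

# Walk the (k, c) output shape with strides (c, 1) while reading the input
# with strides (1, k), mirroring Reorder2 in the converter above.
w_kc = np.empty(c * k, dtype=np.float32)
flat = w_ck.ravel()
for h in range(k):
    for w in range(c):
        w_kc[h * c + w * 1] = flat[h * 1 + w * k]

assert np.array_equal(w_kc.reshape(k, c), w_ck.T)  # same result as a transpose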
......@@ -24,8 +24,9 @@ namespace tensorrt {
class MulOpConverter : public OpConverter {
public:
MulOpConverter() {}
void operator()(const framework::proto::OpDesc& op) override {
VLOG(4) << "convert a fluid mul op to tensorrt fc layer without bias";
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope) override {
VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
......
......@@ -31,27 +31,42 @@ namespace tensorrt {
class OpConverter {
public:
OpConverter() {}
virtual void operator()(const framework::proto::OpDesc& op) {}
void Run(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
std::string type = op.type();
auto* it = Registry<OpConverter>::Lookup(type);
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", type);
it->SetEngine(engine);
(*it)(op);
}
// Converter logic for an op.
virtual void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope) {}
// Convert a single fluid operator and add the corresponding layer to TRT.
void ConvertOp(const framework::proto::OpDesc& op,
const std::unordered_set<std::string>& parameters,
const framework::Scope& scope, TensorRTEngine* engine) {
framework::OpDesc op_desc(op, nullptr);
OpConverter* it{nullptr};
// convert fluid op to tensorrt layer
void ConvertOp(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
OpConverter::Run(op, engine);
if (op_desc.Type() == "mul") {
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
std::string Y = op_desc.Input("Y")[0];
if (parameters.count(Y)) {
it = Registry<OpConverter>::Lookup("fc");
}
}
if (!it) {
it = Registry<OpConverter>::Lookup(op_desc.Type());
}
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
it->SetEngine(engine);
(*it)(op, scope);
}
// convert fluid block to tensorrt network
void ConvertBlock(const framework::proto::BlockDesc& block,
TensorRTEngine* engine) {
const std::unordered_set<std::string>& parameters,
const framework::Scope& scope, TensorRTEngine* engine) {
for (int i = 0; i < block.ops_size(); i++) {
const auto& op = block.ops(i);
OpConverter::Run(op, engine);
ConvertOp(op, parameters, scope, engine);
}
}
......
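A rough Python sketch (not from the source) of the dispatch rule ConvertOp implements: a `mul` whose Y input is a persistable parameter is routed to the fc converter, and every other op is looked up by its type.

# Hypothetical registry; the real code uses Registry<OpConverter>::Lookup.
registry = {"fc": "FcOpConverter", "mul": "MulOpConverter"}

def pick_converter(op_type, inputs, parameters):
    """Mirror the special case: mul with a parameter Y becomes fc."""
    if op_type == "mul" and inputs.get("Y") in parameters:
        return registry["fc"]
    return registry[op_type]

print(pick_converter("mul", {"X": "x", "Y": "mul-Y"}, {"mul-Y"}))  # FcOpConverter
print(pick_converter("mul", {"X": "x", "Y": "y"}, {"mul-Y"}))      # MulOpConverter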
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(fc_op, test) {
std::unordered_set<std::string> parameters({"mul-Y"});
framework::Scope scope;
TRTConvertValidation validator(20, parameters, scope, 1000);
validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1));
validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2));
validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("mul");
desc.SetInput("X", {"mul-X"});
desc.SetInput("Y", {"mul-Y"});
desc.SetOutput("Out", {"mul-Out"});
validator.SetOp(*desc.Proto());
validator.Execute(10);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
......@@ -21,7 +21,9 @@ namespace inference {
namespace tensorrt {
TEST(MulOpConverter, main) {
TRTConvertValidation validator(10, 1000);
framework::Scope scope;
std::unordered_set<std::string> parameters;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
......
......@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
......@@ -27,7 +28,9 @@ TEST(OpConverter, ConvertBlock) {
conv2d_op->SetType("conv2d");
OpConverter converter;
converter.ConvertBlock(*block->Proto(), nullptr /*TensorRTEngine*/);
framework::Scope scope;
converter.ConvertBlock(*block->Proto(), {}, scope,
nullptr /*TensorRTEngine*/);
}
} // namespace tensorrt
......
......@@ -61,7 +61,10 @@ class TRTConvertValidation {
public:
TRTConvertValidation() = delete;
explicit TRTConvertValidation(int batch_size, int workspace_size = 1024) {
TRTConvertValidation(int batch_size,
const std::unordered_set<std::string>& parameters,
framework::Scope& scope, int workspace_size = 1 << 10)
: parameters_(parameters), scope_(scope) {
// create engine.
engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
engine_->InitNetwork();
......@@ -76,19 +79,22 @@ class TRTConvertValidation {
engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
}
// Declare a parameter variable in the scope.
void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
DeclVar(name, dims);
}
void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
DeclVar(name, dims);
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
// Init Fluid tensor.
std::vector<int> dim_vec(dims.nbDims);
for (int i = 0; i < dims.nbDims; i++) {
dim_vec[i] = dims.d[i];
}
std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
......@@ -99,7 +105,7 @@ class TRTConvertValidation {
op_ = framework::OpRegistry::CreateOp(desc);
OpConverter op_converter;
op_converter.ConvertOp(desc, engine_.get());
op_converter.ConvertOp(desc, parameters_, scope_, engine_.get());
engine_->FreezeNetwork();
......@@ -108,11 +114,13 @@ class TRTConvertValidation {
// Set Inputs.
for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue;
auto* var = scope_.FindVar(input);
PADDLE_ENFORCE(var);
auto tensor = var->GetMutable<framework::LoDTensor>();
engine_->SetInputFromCPU(
input, static_cast<void*>(tensor->data<float>()),
input, static_cast<void*>(tensor->data<void>()),
sizeof(float) *
analysis::AccuDims(tensor->dims(), tensor->dims().size()));
}
......@@ -120,18 +128,21 @@ class TRTConvertValidation {
void Execute(int batch_size) {
// Execute Fluid Op
// Execute TRT
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
engine_->Execute(batch_size);
op_->Run(scope_, place);
// Execute TRT.
engine_->Execute(batch_size);
cudaStreamSynchronize(*engine_->stream());
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
const size_t output_space_size = 200;
for (const auto& output : op_desc_->OutputArgumentNames()) {
std::vector<float> fluid_out;
std::vector<float> trt_out(200);
engine_->GetOutputInCPU(output, &trt_out[0], 200 * sizeof(float));
std::vector<float> trt_out(output_space_size);
engine_->GetOutputInCPU(output, &trt_out[0],
output_space_size * sizeof(float));
cudaStreamSynchronize(*engine_->stream());
auto* var = scope_.FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>();
......@@ -139,7 +150,7 @@ class TRTConvertValidation {
// Compare the two outputs.
ASSERT_FALSE(fluid_out.empty());
for (size_t i = 0; i < fluid_out.size(); i++) {
EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 0.001);
EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6);
}
}
}
......@@ -149,9 +160,10 @@ class TRTConvertValidation {
private:
std::unique_ptr<TensorRTEngine> engine_;
cudaStream_t stream_;
framework::Scope scope_;
std::unique_ptr<framework::OperatorBase> op_;
std::unique_ptr<framework::OpDesc> op_desc_;
const std::unordered_set<std::string>& parameters_;
framework::Scope& scope_;
};
} // namespace tensorrt
......
......@@ -106,6 +106,7 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
name);
auto* output = layer->getOutput(offset);
SetITensor(name, output);
PADDLE_ENFORCE(output != nullptr);
output->setName(name.c_str());
infer_network_->markOutput(*output);
......
......@@ -37,13 +37,15 @@ class TensorRTEngine : public EngineBase {
// Weight is model parameter.
class Weight {
public:
Weight(nvinfer1::DataType dtype, void* value, int num_elem) {
Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
w_.type = dtype;
w_.values = value;
w_.count = num_elem;
}
const nvinfer1::Weights& get() { return w_; }
std::vector<int64_t> dims;
private:
nvinfer1::Weights w_;
};
......
......@@ -38,3 +38,11 @@ inference_test(recommender_system)
#inference_test(rnn_encoder_decoder)
#inference_test(understand_sentiment ARGS conv)
inference_test(word2vec)
# This is an ugly workaround to make this test run
# TODO(TJ): clean me up
cc_test(test_inference_nlp
SRCS test_inference_nlp.cc
DEPS paddle_fluid
ARGS
--model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sys/time.h>
#include <time.h>
#include <fstream>
#include <thread> // NOLINT
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#ifdef PADDLE_WITH_MKLML
#include <mkl_service.h>
#include <omp.h>
#endif
DEFINE_string(model_path, "", "Directory of the inference model.");
DEFINE_string(data_file, "", "File of input index data.");
DEFINE_int32(repeat, 100, "Running the inference program repeat times");
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");
DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
DEFINE_int32(num_threads, 1, "Number of threads should be used");
inline double GetCurrentMs() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
}
// This function just gives dummy data for the recognize_digits model.
size_t DummyData(std::vector<paddle::framework::LoDTensor>* out) {
paddle::framework::LoDTensor input;
SetupTensor<float>(&input, {1, 1, 28, 28}, -1.f, 1.f);
out->emplace_back(input);
return 1;
}
// Load the input word index data from file and save into LodTensor.
// Return the size of words.
size_t LoadData(std::vector<paddle::framework::LoDTensor>* out,
const std::string& filename) {
if (filename.empty()) {
return DummyData(out);
}
size_t sz = 0;
std::fstream fin(filename);
std::string line;
out->clear();
while (getline(fin, line)) {
std::istringstream iss(line);
std::vector<int64_t> ids;
std::string field;
while (getline(iss, field, ' ')) {
ids.push_back(stoi(field));
}
if (ids.size() >= 1024) {
// Synced with the NLP team: they will ignore inputs larger than 1024
continue;
}
paddle::framework::LoDTensor words;
paddle::framework::LoD lod{{0, ids.size()}};
words.set_lod(lod);
int64_t* pdata = words.mutable_data<int64_t>(
{static_cast<int64_t>(ids.size()), 1}, paddle::platform::CPUPlace());
memcpy(pdata, ids.data(), words.numel() * sizeof(int64_t));
out->emplace_back(words);
sz += ids.size();
}
return sz;
}
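Based on the parsing loop above (an assumption, since the format is not documented here), the --data_file is expected to hold one sample per line as space-separated word ids; a quick Python sketch that writes such a file:

# Hypothetical word-id samples; lines with >= 1024 ids are skipped by LoadData.
samples = [[3, 41, 7, 19], [2, 2, 2], [15, 8]]
with open("nlp_input.txt", "w") as f:
    for ids in samples:
        f.write(" ".join(str(i) for i in ids) + "\n")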
// Split the input data samples into jobs that are as balanced as possible,
// according to the number of threads.
void SplitData(
const std::vector<paddle::framework::LoDTensor>& datasets,
std::vector<std::vector<const paddle::framework::LoDTensor*>>* jobs,
const int num_threads) {
size_t s = 0;
jobs->resize(num_threads);
while (s < datasets.size()) {
for (auto it = jobs->begin(); it != jobs->end(); it++) {
it->emplace_back(&datasets[s]);
s++;
if (s >= datasets.size()) {
break;
}
}
}
}
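SplitData deals samples out round-robin across the per-thread job lists, so job sizes differ by at most one; a minimal Python sketch of the same policy (assumed equivalent, not the actual implementation):

def split_data(datasets, num_threads):
    # Deal samples out one by one so job sizes differ by at most one.
    jobs = [[] for _ in range(num_threads)]
    for i, sample in enumerate(datasets):
        jobs[i % len(jobs)].append(sample)
    return jobs

print([len(j) for j in split_data(list(range(10)), 3)])  # [4, 3, 3]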
void ThreadRunInfer(
const int tid, paddle::framework::Executor* executor,
paddle::framework::Scope* scope,
const std::unique_ptr<paddle::framework::ProgramDesc>& inference_program,
const std::vector<std::vector<const paddle::framework::LoDTensor*>>& jobs) {
auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
new paddle::framework::ProgramDesc(*inference_program));
auto& sub_scope = scope->NewScope();
std::string feed_holder_name = "feed_" + paddle::string::to_string(tid);
std::string fetch_holder_name = "fetch_" + paddle::string::to_string(tid);
copy_program->SetFeedHolderName(feed_holder_name);
copy_program->SetFetchHolderName(fetch_holder_name);
const std::vector<std::string>& feed_target_names =
copy_program->GetFeedTargetNames();
const std::vector<std::string>& fetch_target_names =
copy_program->GetFetchTargetNames();
PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
paddle::framework::LoDTensor outtensor;
fetch_targets[fetch_target_names[0]] = &outtensor;
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL);
auto& inputs = jobs[tid];
auto start_ms = GetCurrentMs();
for (size_t i = 0; i < inputs.size(); ++i) {
feed_targets[feed_target_names[0]] = inputs[i];
executor->Run(*copy_program, &sub_scope, &feed_targets, &fetch_targets,
true /*create_local_scope*/, true /*create_vars*/,
feed_holder_name, fetch_holder_name);
}
auto stop_ms = GetCurrentMs();
scope->DeleteScope(&sub_scope);
LOG(INFO) << "Tid: " << tid << ", process " << inputs.size()
<< " samples, avg time per sample: "
<< (stop_ms - start_ms) / inputs.size() << " ms";
}
TEST(inference, nlp) {
if (FLAGS_model_path.empty()) {
LOG(FATAL) << "Usage: ./example --model_path=path/to/your/model";
}
if (FLAGS_data_file.empty()) {
LOG(WARNING) << "No data file provided, will use dummy data!"
<< "Note: if you use nlp model, please provide data file.";
}
LOG(INFO) << "Model Path: " << FLAGS_model_path;
LOG(INFO) << "Data File: " << FLAGS_data_file;
std::vector<paddle::framework::LoDTensor> datasets;
size_t num_total_words = LoadData(&datasets, FLAGS_data_file);
LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size();
LOG(INFO) << "Total number of words: " << num_total_words;
const bool model_combined = false;
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// 1. Define place, executor, scope
auto place = paddle::platform::CPUPlace();
auto executor = paddle::framework::Executor(place);
std::unique_ptr<paddle::framework::Scope> scope(
new paddle::framework::Scope());
// 2. Initialize the inference_program and load parameters
std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
inference_program =
InitProgram(&executor, scope.get(), FLAGS_model_path, model_combined);
if (FLAGS_use_mkldnn) {
EnableMKLDNN(inference_program);
}
#ifdef PADDLE_WITH_MKLML
// use only 1 MKL/OMP thread per std::thread
omp_set_dynamic(0);
omp_set_num_threads(1);
mkl_set_num_threads(1);
#endif
double start_ms = 0, stop_ms = 0;
if (FLAGS_num_threads > 1) {
std::vector<std::vector<const paddle::framework::LoDTensor*>> jobs;
SplitData(datasets, &jobs, FLAGS_num_threads);
std::vector<std::unique_ptr<std::thread>> threads;
start_ms = GetCurrentMs();
for (int i = 0; i < FLAGS_num_threads; ++i) {
threads.emplace_back(
new std::thread(ThreadRunInfer, i, &executor, scope.get(),
std::ref(inference_program), std::ref(jobs)));
}
for (int i = 0; i < FLAGS_num_threads; ++i) {
threads[i]->join();
}
stop_ms = GetCurrentMs();
} else {
if (FLAGS_prepare_vars) {
executor.CreateVariables(*inference_program, scope.get(), 0);
}
// always prepare context
std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
ctx = executor.Prepare(*inference_program, 0);
// prepare fetch
const std::vector<std::string>& fetch_target_names =
inference_program->GetFetchTargetNames();
PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
paddle::framework::LoDTensor outtensor;
fetch_targets[fetch_target_names[0]] = &outtensor;
// prepare feed
const std::vector<std::string>& feed_target_names =
inference_program->GetFeedTargetNames();
PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL);
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
// feed data and run
start_ms = GetCurrentMs();
for (size_t i = 0; i < datasets.size(); ++i) {
feed_targets[feed_target_names[0]] = &(datasets[i]);
executor.RunPreparedContext(ctx.get(), scope.get(), &feed_targets,
&fetch_targets, !FLAGS_prepare_vars);
}
stop_ms = GetCurrentMs();
LOG(INFO) << "Tid: 0, process " << datasets.size()
<< " samples, avg time per sample: "
<< (stop_ms - start_ms) / datasets.size() << " ms";
}
LOG(INFO) << "Total inference time with " << FLAGS_num_threads
<< " threads : " << (stop_ms - start_ms) / 1000.0
<< " sec, QPS: " << datasets.size() / ((stop_ms - start_ms) / 1000);
}
......@@ -222,35 +222,35 @@ struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
};
template <typename T>
using ReluMkldnnFunctor =
using ReluMKLDNNFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
using TanhMkldnnFunctor =
using TanhMKLDNNFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
using SqrtMkldnnFunctor =
using SqrtMKLDNNFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
using AbsMkldnnFunctor =
using AbsMKLDNNFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
template <typename T>
using ReluMkldnnGradFunctor =
using ReluMKLDNNGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
using TanhMkldnnGradFunctor =
using TanhMKLDNNGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
using SqrtMkldnnGradFunctor =
using SqrtMKLDNNGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
using AbsMkldnnGradFunctor =
using AbsMKLDNNGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
} // namespace operators
} // namespace paddle
......@@ -265,9 +265,9 @@ namespace ops = paddle::operators;
ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \
__macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \
__macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \
__macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \
__macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor);
__macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \
__macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \
__macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor); \
__macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor);
FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
......@@ -71,8 +71,6 @@ class AsyncGRPCServer final : public RPCServer {
std::unique_ptr<::grpc::Server> server_;
// condition of the sub program
std::mutex barrier_mutex_;
mutable int barrier_cond_step_;
std::condition_variable barrier_condition_;
std::mutex mutex_ready_;
......
......@@ -91,6 +91,10 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
"the code type used with the target box")
.SetDefault("encode_center_size")
.InEnum({"encode_center_size", "decode_center_size"});
AddAttr<bool>("box_normalized",
"(bool, default true) "
"whether treat the priorbox as a noramlized box")
.SetDefault(true);
AddOutput("OutputBox",
"(LoDTensor or Tensor) "
"When code_type is 'encode_center_size', the output tensor of "
......
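A brief reading of the new attribute (my interpretation of the kernels below, not an authoritative statement): when box_normalized is false the coordinates are treated as pixel indices that include both endpoints, so box widths and heights gain an extra +1 during encoding and the decoded xmax/ymax have 1 subtracted back. For example:

xmin, xmax = 2.0, 5.0
width_normalized = xmax - xmin        # 3.0
width_unnormalized = xmax - xmin + 1  # 4.0, matches the "+ (normalized == false)" terms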
......@@ -20,15 +20,16 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data,
const T* prior_box_var_data,
const T* target_box_data, const int row,
const int col, const int len,
T* output) {
const bool normalized, T* output) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < row * col) {
const int row_idx = idx / col;
const int col_idx = idx % col;
T prior_box_width =
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
T prior_box_height =
prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
T prior_box_width = prior_box_data[col_idx * len + 2] -
prior_box_data[col_idx * len] + (normalized == false);
T prior_box_height = prior_box_data[col_idx * len + 3] -
prior_box_data[col_idx * len + 1] +
(normalized == false);
T prior_box_center_x =
(prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
......@@ -41,10 +42,11 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data,
T target_box_center_y = (target_box_data[row_idx * len + 3] +
target_box_data[row_idx * len + 1]) /
2;
T target_box_width =
target_box_data[row_idx * len + 2] - target_box_data[row_idx * len];
T target_box_height =
target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1];
T target_box_width = target_box_data[row_idx * len + 2] -
target_box_data[row_idx * len] + (normalized == false);
T target_box_height = target_box_data[row_idx * len + 3] -
target_box_data[row_idx * len + 1] +
(normalized == false);
output[idx * len] = (target_box_center_x - prior_box_center_x) /
prior_box_width / prior_box_var_data[col_idx * len];
......@@ -63,14 +65,15 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
const T* prior_box_var_data,
const T* target_box_data, const int row,
const int col, const int len,
T* output) {
const bool normalized, T* output) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < row * col) {
const int col_idx = idx % col;
T prior_box_width =
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
T prior_box_height =
prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
T prior_box_width = prior_box_data[col_idx * len + 2] -
prior_box_data[col_idx * len] + (normalized == false);
T prior_box_height = prior_box_data[col_idx * len + 3] -
prior_box_data[col_idx * len + 1] +
(normalized == false);
T prior_box_center_x =
(prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
......@@ -93,8 +96,10 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
output[idx * len] = target_box_center_x - target_box_width / 2;
output[idx * len + 1] = target_box_center_y - target_box_height / 2;
output[idx * len + 2] = target_box_center_x + target_box_width / 2;
output[idx * len + 3] = target_box_center_y + target_box_height / 2;
output[idx * len + 2] =
target_box_center_x + target_box_width / 2 - (normalized == false);
output[idx * len + 3] =
target_box_center_y + target_box_height / 2 - (normalized == false);
}
}
......@@ -128,14 +133,15 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
T* output = output_box->data<T>();
auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
bool normalized = context.Attr<bool>("box_normalized");
if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
prior_box_data, prior_box_var_data, target_box_data, row, col, len,
output);
normalized, output);
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
prior_box_data, prior_box_var_data, target_box_data, row, col, len,
output);
normalized, output);
}
}
};
......
......@@ -34,7 +34,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
void EncodeCenterSize(const framework::Tensor& target_box,
const framework::Tensor& prior_box,
const framework::Tensor& prior_box_var,
T* output) const {
const bool normalized, T* output) const {
int64_t row = target_box.dims()[0];
int64_t col = prior_box.dims()[0];
int64_t len = prior_box.dims()[1];
......@@ -44,10 +44,11 @@ class BoxCoderKernel : public framework::OpKernel<T> {
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
T prior_box_width =
prior_box_data[j * len + 2] - prior_box_data[j * len];
T prior_box_height =
prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
T prior_box_width = prior_box_data[j * len + 2] -
prior_box_data[j * len] + (normalized == false);
T prior_box_height = prior_box_data[j * len + 3] -
prior_box_data[j * len + 1] +
(normalized == false);
T prior_box_center_x =
(prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
T prior_box_center_y =
......@@ -57,10 +58,11 @@ class BoxCoderKernel : public framework::OpKernel<T> {
(target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
T target_box_center_y =
(target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
T target_box_width =
target_box_data[i * len + 2] - target_box_data[i * len];
T target_box_height =
target_box_data[i * len + 3] - target_box_data[i * len + 1];
T target_box_width = target_box_data[i * len + 2] -
target_box_data[i * len] + (normalized == false);
T target_box_height = target_box_data[i * len + 3] -
target_box_data[i * len + 1] +
(normalized == false);
size_t offset = i * col * len + j * len;
output[offset] = (target_box_center_x - prior_box_center_x) /
......@@ -79,7 +81,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
void DecodeCenterSize(const framework::Tensor& target_box,
const framework::Tensor& prior_box,
const framework::Tensor& prior_box_var,
T* output) const {
const bool normalized, T* output) const {
int64_t row = target_box.dims()[0];
int64_t col = prior_box.dims()[0];
int64_t len = prior_box.dims()[1];
......@@ -91,10 +93,11 @@ class BoxCoderKernel : public framework::OpKernel<T> {
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
size_t offset = i * col * len + j * len;
T prior_box_width =
prior_box_data[j * len + 2] - prior_box_data[j * len];
T prior_box_height =
prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
T prior_box_width = prior_box_data[j * len + 2] -
prior_box_data[j * len] + (normalized == false);
T prior_box_height = prior_box_data[j * len + 3] -
prior_box_data[j * len + 1] +
(normalized == false);
T prior_box_center_x =
(prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
T prior_box_center_y =
......@@ -116,8 +119,10 @@ class BoxCoderKernel : public framework::OpKernel<T> {
output[offset] = target_box_center_x - target_box_width / 2;
output[offset + 1] = target_box_center_y - target_box_height / 2;
output[offset + 2] = target_box_center_x + target_box_width / 2;
output[offset + 3] = target_box_center_y + target_box_height / 2;
output[offset + 2] =
target_box_center_x + target_box_width / 2 - (normalized == false);
output[offset + 3] =
target_box_center_y + target_box_height / 2 - (normalized == false);
}
}
}
......@@ -139,11 +144,14 @@ class BoxCoderKernel : public framework::OpKernel<T> {
output_box->mutable_data<T>({row, col, len}, context.GetPlace());
auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
bool normalized = context.Attr<bool>("box_normalized");
T* output = output_box->data<T>();
if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
EncodeCenterSize(*target_box, *prior_box, *prior_box_var, normalized,
output);
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
DecodeCenterSize(*target_box, *prior_box, *prior_box_var, normalized,
output);
}
}
};
......
......@@ -222,8 +222,8 @@ static void FillRequestCtx(detail::RequestHandler *h, framework::Scope *scope,
h->SetDevCtx(dev_ctx);
h->SetExecutor(executor);
h->SetProgram(program);
h->SetPrefetchPreparedCtx(std::move(
std::unique_ptr<framework::ExecutorPrepareContext>(prefetch_ctx)));
h->SetPrefetchPreparedCtx(
std::unique_ptr<framework::ExecutorPrepareContext>(prefetch_ctx));
h->SetRPCServer(rpc_server);
}
......
......@@ -26,7 +26,11 @@ class MultiFileReader : public framework::ReaderBase {
MultiFileReader(const std::vector<std::string>& file_names,
const std::vector<framework::DDim>& dims, size_t thread_num,
size_t buffer_size)
: file_names_(file_names), dims_(dims), buffer_size_(buffer_size) {
: buffer_size_(buffer_size) {
readers_.reserve(file_names.size());
for (const std::string& f_name : file_names) {
readers_.emplace_back(CreateReaderByFileName(f_name, dims));
}
prefetchers_.resize(thread_num);
StartNewScheduler();
}
......@@ -40,14 +44,13 @@ class MultiFileReader : public framework::ReaderBase {
void StartNewScheduler();
void EndScheduler();
void ScheduleThreadFunc();
void PrefetchThreadFunc(std::string file_name, size_t thread_idx);
void PrefetchThreadFunc(size_t reader_idx, size_t thread_idx);
std::vector<std::string> file_names_;
std::vector<framework::DDim> dims_;
std::vector<std::unique_ptr<framework::ReaderBase>> readers_;
std::thread scheduler_;
std::vector<std::thread> prefetchers_;
size_t buffer_size_;
reader::BlockingQueue<size_t>* waiting_file_idx_;
reader::BlockingQueue<size_t>* waiting_reader_idx_;
reader::BlockingQueue<size_t>* available_thread_idx_;
reader::BlockingQueue<std::vector<framework::LoDTensor>>* buffer_;
};
......@@ -65,15 +68,15 @@ void MultiFileReader::ReInit() {
void MultiFileReader::StartNewScheduler() {
size_t thread_num = prefetchers_.size();
waiting_file_idx_ = new reader::BlockingQueue<size_t>(file_names_.size());
waiting_reader_idx_ = new reader::BlockingQueue<size_t>(readers_.size());
available_thread_idx_ = new reader::BlockingQueue<size_t>(thread_num);
buffer_ = new reader::BlockingQueue<std::vector<framework::LoDTensor>>(
buffer_size_);
for (size_t i = 0; i < file_names_.size(); ++i) {
waiting_file_idx_->Send(i);
for (size_t i = 0; i < readers_.size(); ++i) {
waiting_reader_idx_->Send(i);
}
waiting_file_idx_->Close();
waiting_reader_idx_->Close();
for (size_t i = 0; i < thread_num; ++i) {
available_thread_idx_->Send(i);
}
......@@ -84,13 +87,13 @@ void MultiFileReader::StartNewScheduler() {
void MultiFileReader::EndScheduler() {
available_thread_idx_->Close();
buffer_->Close();
waiting_file_idx_->Close();
waiting_reader_idx_->Close();
if (scheduler_.joinable()) {
scheduler_.join();
}
delete buffer_;
delete available_thread_idx_;
delete waiting_file_idx_;
delete waiting_reader_idx_;
}
void MultiFileReader::ScheduleThreadFunc() {
......@@ -102,12 +105,11 @@ void MultiFileReader::ScheduleThreadFunc() {
if (prefetcher.joinable()) {
prefetcher.join();
}
size_t file_idx;
if (waiting_file_idx_->Receive(&file_idx)) {
size_t reader_idx;
if (waiting_reader_idx_->Receive(&reader_idx)) {
// Still have readers to run. Start a new prefetch thread.
std::string file_name = file_names_[file_idx];
prefetcher = std::thread([this, file_name, thread_idx] {
PrefetchThreadFunc(file_name, thread_idx);
prefetcher = std::thread([this, reader_idx, thread_idx] {
PrefetchThreadFunc(reader_idx, thread_idx);
});
} else {
// No more file to read.
......@@ -129,23 +131,22 @@ void MultiFileReader::ScheduleThreadFunc() {
VLOG(5) << "MultiFileReader schedule thread terminates.";
}
void MultiFileReader::PrefetchThreadFunc(std::string file_name,
size_t thread_idx) {
VLOG(5) << "The prefetch thread of file '" << file_name << "' starts.";
std::unique_ptr<framework::ReaderBase> reader =
CreateReaderByFileName(file_name, dims_);
void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) {
VLOG(5) << "The prefetch thread of file idx '" << reader_idx << "' starts.";
std::unique_ptr<framework::ReaderBase>& reader = readers_[reader_idx];
while (true) {
std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins);
if (ins.empty()) {
reader->ReInit();
break;
}
try {
buffer_->Send(std::move(ins));
} catch (paddle::platform::EnforceNotMet e) {
VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
"thread of file '"
<< file_name << "' will terminate.";
"thread of file idx '"
<< reader_idx << "' will terminate.";
break;
}
}
......@@ -154,7 +155,8 @@ void MultiFileReader::PrefetchThreadFunc(std::string file_name,
VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
"Fail to send thread_idx.";
}
VLOG(5) << "The prefetch thread of file '" << file_name << "' terminates.";
VLOG(5) << "The prefetch thread of file idx '" << reader_idx
<< "' terminates.";
}
class OpenFilesOp : public framework::OperatorBase {
......
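The scheduling in MultiFileReader is a bounded producer-consumer setup: pending reader indices and idle thread slots live in blocking queues, and prefetch threads push batches into a bounded buffer. A loose Python sketch of that shape (queue names and structure are assumptions, not the actual code):

import queue
import threading

def prefetch(reader, buffer):
    # Producer: push every item from one reader into the bounded buffer,
    # then a sentinel so the consumer knows this reader is done.
    for item in reader():
        buffer.put(item)
    buffer.put(None)

readers = [lambda: iter([1, 2]), lambda: iter([3, 4])]
buffer = queue.Queue(maxsize=2)  # plays the role of the bounded buffer_
workers = [threading.Thread(target=prefetch, args=(r, buffer)) for r in readers]
for w in workers:
    w.start()
done = 0
while done < len(readers):
    item = buffer.get()
    if item is None:
        done += 1
    else:
        print(item)
for w in workers:
    w.join()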
......@@ -114,7 +114,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
int64_t id_index = param.Index(grad.rows()[i]);
PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
"id should be in the table");
for (size_t j = 0; j < grad_row_width; j++) {
for (int64_t j = 0; j < grad_row_width; j++) {
out_data[id_index * grad_row_width + j] -=
lr[0] * grad_data[i * grad_row_width + j];
}
......
......@@ -31,8 +31,9 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
auto max_workspace = context.Attr<int>("max_workspace");
engine_.reset(new inference::tensorrt::TensorRTEngine(
max_batch_, max_workspace, nullptr));
// TODO(Superjomn) parameters should be passed in after being analyzed from outside.
inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
block, engine_.get());
block, {}, context.scope(), engine_.get());
engine_->FreezeNetwork();
}
......
......@@ -127,6 +127,7 @@ double Event::CpuElapsedMs(const Event& e) const {
double Event::CudaElapsedMs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA
if (!has_cuda_) return 0.0;
PADDLE_ENFORCE(e.has_cuda() && has_cuda());
PADDLE_ENFORCE(e.device() == device());
PADDLE_ENFORCE(cudaEventSynchronize(event_));
......
......@@ -119,40 +119,56 @@ bool Chunk::Write(std::ostream& os, Compressor ct) const {
}
bool Chunk::Parse(std::istream& sin) {
Header hdr;
bool ok = hdr.Parse(sin);
ChunkParser parser(sin);
if (!parser.Init()) {
return false;
}
Clear();
while (parser.HasNext()) {
Add(parser.Next());
}
return true;
}
ChunkParser::ChunkParser(std::istream& sin) : in_(sin) {}
bool ChunkParser::Init() {
pos_ = 0;
bool ok = header_.Parse(in_);
if (!ok) {
return ok;
}
auto beg_pos = sin.tellg();
uint32_t crc = Crc32Stream(sin, hdr.CompressSize());
PADDLE_ENFORCE_EQ(hdr.Checksum(), crc);
Clear();
sin.seekg(beg_pos, sin.beg);
std::unique_ptr<std::istream> compressed_stream;
switch (hdr.CompressType()) {
auto beg_pos = in_.tellg();
uint32_t crc = Crc32Stream(in_, header_.CompressSize());
PADDLE_ENFORCE_EQ(header_.Checksum(), crc);
in_.seekg(beg_pos, in_.beg);
switch (header_.CompressType()) {
case Compressor::kNoCompress:
break;
case Compressor::kSnappy:
compressed_stream.reset(new snappy::iSnappyStream(sin));
compressed_stream_.reset(new snappy::iSnappyStream(in_));
break;
default:
PADDLE_THROW("Not implemented");
}
return true;
}
std::istream& stream = compressed_stream ? *compressed_stream : sin;
bool ChunkParser::HasNext() const { return pos_ < header_.NumRecords(); }
for (uint32_t i = 0; i < hdr.NumRecords(); ++i) {
uint32_t rec_len;
stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
std::string buf;
buf.resize(rec_len);
stream.read(&buf[0], rec_len);
PADDLE_ENFORCE_EQ(rec_len, stream.gcount());
Add(buf);
std::string ChunkParser::Next() {
if (!HasNext()) {
return "";
}
return true;
++pos_;
std::istream& stream = compressed_stream_ ? *compressed_stream_ : in_;
uint32_t rec_len;
stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
std::string buf;
buf.resize(rec_len);
stream.read(&buf[0], rec_len);
PADDLE_ENFORCE_EQ(rec_len, stream.gcount());
return buf;
}
} // namespace recordio
} // namespace paddle
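Stripped of the compression and CRC handling, the per-record framing that ChunkParser::Next walks is just a uint32 length followed by that many raw bytes; a hedged Python sketch of reading one record:

import struct

def read_record(stream):
    # One record = uint32 length, then that many raw bytes.
    raw_len = stream.read(4)
    if len(raw_len) < 4:
        return None
    (rec_len,) = struct.unpack("<I", raw_len)  # assuming a little-endian host
    buf = stream.read(rec_len)
    assert len(buf) == rec_len, "truncated record"
    return buf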
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
......@@ -53,9 +54,20 @@ class Chunk {
DISABLE_COPY_AND_ASSIGN(Chunk);
};
size_t CompressData(const char* in, size_t in_length, Compressor ct, char* out);
class ChunkParser {
public:
explicit ChunkParser(std::istream& sin);
bool Init();
std::string Next();
bool HasNext() const;
void DeflateData(const char* in, size_t in_length, Compressor ct, char* out);
private:
Header header_;
uint32_t pos_{0};
std::istream& in_;
std::unique_ptr<std::istream> compressed_stream_;
};
} // namespace recordio
} // namespace paddle
......@@ -22,35 +22,33 @@ namespace paddle {
namespace recordio {
Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
: stream_(std::move(stream)) {
: stream_(std::move(stream)), parser_(*stream_) {
Reset();
}
Scanner::Scanner(const std::string &filename) {
stream_.reset(new std::ifstream(filename));
Scanner::Scanner(const std::string &filename)
: stream_(new std::ifstream(filename)), parser_(*stream_) {
Reset();
}
void Scanner::Reset() {
stream_->clear();
stream_->seekg(0, std::ios::beg);
ParseNextChunk();
parser_.Init();
}
std::string Scanner::Next() {
PADDLE_ENFORCE(!eof_, "StopIteration");
auto rec = cur_chunk_.Record(offset_++);
if (offset_ == cur_chunk_.NumRecords()) {
ParseNextChunk();
if (stream_->eof()) {
return "";
}
return rec;
}
void Scanner::ParseNextChunk() {
eof_ = !cur_chunk_.Parse(*stream_);
offset_ = 0;
auto res = parser_.Next();
if (!parser_.HasNext() && HasNext()) {
parser_.Init();
}
return res;
}
bool Scanner::HasNext() const { return !eof_; }
bool Scanner::HasNext() const { return !stream_->eof(); }
} // namespace recordio
} // namespace paddle
......@@ -37,11 +37,7 @@ class Scanner {
private:
std::unique_ptr<std::istream> stream_;
Chunk cur_chunk_;
size_t offset_;
bool eof_;
void ParseNextChunk();
ChunkParser parser_;
};
} // namespace recordio
} // namespace paddle
......@@ -145,19 +145,17 @@ function check_style() {
trap 'abort' 0
set -e
# install glide
curl https://glide.sh/get | bash
eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
if [ -x "$(command -v gimme)" ]; then
eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
fi
# set up go environment for running gometalinter
mkdir -p $GOPATH/src/github.com/PaddlePaddle/
ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle
cd $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
go get github.com/alecthomas/gometalinter
gometalinter --install
mkdir -p ./build/go
cp go/glide.* build/go
cd build/go; glide install; cd -
cd ${PADDLE_ROOT}
export PATH=/usr/bin:$PATH
pre-commit install
clang-format --version
......
......@@ -15,7 +15,7 @@
__all__ = ['batch']
def batch(reader, batch_size):
def batch(reader, batch_size, drop_last=False):
"""
Create a batched reader.
......@@ -23,6 +23,8 @@ def batch(reader, batch_size):
:type reader: callable
:param batch_size: size of each mini-batch
:type batch_size: int
:param drop_last: whether to drop the last batch if its size is less than batch_size.
:type drop_last: bool
:return: the batched reader.
:rtype: callable
"""
......@@ -35,7 +37,7 @@ def batch(reader, batch_size):
if len(b) == batch_size:
yield b
b = []
if b:
if not drop_last and len(b) != 0:
yield b
return batch_reader
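A quick usage sketch of the new flag (illustrative only):

def numbers():
    for i in range(10):
        yield i

# 10 samples, batch_size=3: the default keeps the trailing batch of 1,
# drop_last=True discards it.
print(list(batch(numbers, 3)()))                  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
print(list(batch(numbers, 3, drop_last=True)()))  # [[0, 1, 2], [3, 4, 5], [6, 7, 8]]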
......@@ -44,8 +44,8 @@ import transpiler
from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, \
InferenceTranspiler, memory_optimize, release_memory
from transpiler import DistributeTranspiler, InferenceTranspiler, \
memory_optimize, release_memory
from concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close, Select)
from lod_tensor import create_lod_tensor, create_random_int_lodtensor
......
......@@ -81,7 +81,7 @@ __all__ = [
'label_smooth',
'roi_pool',
'dice_loss',
'upsampling_bilinear2d',
'resize_bilinear',
'gather',
'random_crop',
]
......@@ -3929,9 +3929,9 @@ def dice_loss(input, label, epsilon=0.00001):
return reduce_mean(dice_score)
def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None):
def resize_bilinear(input, out_shape=None, scale=None, name=None):
"""
The mathematical meaning of upsampling_bilinear2d is also called
The mathematical meaning of resize bilinear layer is
Bilinear interpolation.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
......@@ -3941,13 +3941,13 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None):
https://en.wikipedia.org/wiki/Bilinear_interpolation
Args:
input (Variable): The input tensor of bilinear interpolation,
input (Variable): The input tensor of resize bilinear layer,
This is a 4-D tensor of the shape
(num_batches, channels, in_h, in_w).
out_shape(list|tuple|Variable|None): Output shape of bilinear interpolation
out_shape(list|tuple|Variable|None): Output shape of resize bilinear
layer, the shape is (out_h, out_w).
Default: None
scale(int|None): The multiplier for the input height or width.
scale(float|None): The multiplier for the input height or width.
At least one of out_shape or scale must be set.
And out_shape has a higher priority than scale.
Default: None
......@@ -3961,7 +3961,7 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None):
Examples:
.. code-block:: python
out = fluid.layers.bilinear_interp(input, out_shape=[12, 12])
out = fluid.layers.resize_bilinear(input, out_shape=[12, 12])
"""
if out_shape is None and scale is None:
raise ValueError("One of out_shape and scale must not be None")
......@@ -3975,10 +3975,9 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None):
out_w = 0
inputs = {"X": input}
if out_shape is not None:
if not (_is_list_or_turple_(out_shape) and len(out_shape) == 2) and (
out_shape is not Variable):
raise ValueError('out_shape should be a list or tuple ',
'with length 2, (out_h, out_w).')
if not (_is_list_or_turple_(out_shape) and
len(out_shape) == 2) and not isinstance(out_shape, Variable):
raise ValueError('out_shape should be a list or tuple or variable')
if _is_list_or_turple_(out_shape):
out_shape = list(map(int, out_shape))
out_h = out_shape[0]
......
......@@ -71,6 +71,7 @@ __all__ = [
'cumsum',
'scatter',
'sum',
'polygon_box_transform',
'shape',
] + __activations__
......
......@@ -12,10 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import core
import contextlib
__all__ = ['convert_reader_to_recordio_file']
__all__ = [
'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
]
@contextlib.contextmanager
......@@ -46,3 +48,36 @@ def convert_reader_to_recordio_file(
writer.complete_append_tensor()
counter += 1
return counter
def convert_reader_to_recordio_files(
filename,
batch_per_file,
reader_creator,
feeder,
compressor=core.RecordIOWriter.Compressor.Snappy,
max_num_records=1000,
feed_order=None):
if feed_order is None:
feed_order = feeder.feed_names
f_name, f_ext = os.path.splitext(filename)
assert (f_ext == ".recordio")
lines = []
f_idx = 0
counter = 0
for idx, batch in enumerate(reader_creator()):
lines.append(batch)
if idx >= batch_per_file and idx % batch_per_file == 0:
filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
with create_recordio_writer(filename, compressor,
max_num_records) as writer:
for l in lines:
res = feeder.feed(l)
for each in feed_order:
writer.append_tensor(res[each])
writer.complete_append_tensor()
counter += 1
lines = []
f_idx += 1
return counter
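A rough usage sketch of convert_reader_to_recordio_files; the MNIST reader and feeder setup here are illustrative assumptions, not part of the change:

# Split a batched reader into multiple shards, e.g.
# mnist-00000.recordio, mnist-00001.recordio, ...
import paddle
import paddle.fluid as fluid

place = fluid.CPUPlace()
img = fluid.layers.data(name='img', shape=[784])
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=32)
fluid.recordio_writer.convert_reader_to_recordio_files(
    "mnist.recordio", batch_per_file=100, reader_creator=reader, feeder=feeder)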
......@@ -48,13 +48,15 @@ def linear():
return avg_loss
def optimizer_func():
return fluid.optimizer.SGD(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program,
place=place,
optimizer=fluid.optimizer.SGD(learning_rate=0.001))
train_func=train_program, place=place, optimizer_func=optimizer_func)
def event_handler(event):
if isinstance(event, fluid.EndStepEvent):
......
......@@ -85,6 +85,10 @@ def train_network():
return [avg_cost, accuracy]
def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
BATCH_SIZE = 128
EPOCH_NUM = 1
......@@ -111,9 +115,7 @@ def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program,
optimizer=fluid.optimizer.Adam(learning_rate=0.001),
place=place)
train_func=train_program, optimizer_func=optimizer_func, place=place)
trainer.train(
reader=train_reader,
......
......@@ -64,6 +64,10 @@ def train_network():
return [avg_cost, accuracy]
def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
BATCH_SIZE = 128
train_reader = paddle.batch(
......@@ -88,9 +92,7 @@ def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program,
place=place,
optimizer=fluid.optimizer.Adam(learning_rate=0.001))
train_func=train_program, place=place, optimizer_func=optimizer_func)
trainer.train(
reader=train_reader,
......
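The recurring change across these book tests is that fluid.Trainer now takes optimizer_func, a zero-argument callable that builds the optimizer, instead of a pre-built optimizer instance; a condensed sketch of the before/after:

# Before: an optimizer object was passed directly.
# trainer = fluid.Trainer(train_func=train_program, place=place,
#                         optimizer=fluid.optimizer.SGD(learning_rate=0.001))

# After: pass a callable that constructs the optimizer.
def optimizer_func():
    return fluid.optimizer.SGD(learning_rate=0.001)

trainer = fluid.Trainer(
    train_func=train_program, place=place, optimizer_func=optimizer_func)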
......@@ -141,12 +141,16 @@ def train_program():
return [avg_cost]
def optimize_func():
return fluid.optimizer.SGD(learning_rate=fluid.layers.exponential_decay(
learning_rate=0.01, decay_steps=100000, decay_rate=0.5, staircase=True))
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer=optimizer)
train_func=train_program, place=place, optimizer_func=optimize_func)
feed_order = [
'word_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
......@@ -245,7 +249,7 @@ def infer(use_cuda, inference_program, params_dirname):
},
return_numpy=False)
print("infer results: ", np.array(results[0]))
print("infer results: ", np.array(results[0]).shape)
def main(use_cuda):
......
......@@ -158,6 +158,13 @@ def train_program(is_sparse):
return avg_cost
def optimizer_func():
return fluid.optimizer.Adagrad(
learning_rate=1e-4,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.1))
def train(use_cuda, is_sparse, is_local=True):
EPOCH_NUM = 1
......@@ -182,11 +189,8 @@ def train(use_cuda, is_sparse, is_local=True):
trainer = fluid.Trainer(
train_func=partial(train_program, is_sparse),
optimizer=fluid.optimizer.Adagrad(
learning_rate=1e-4,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.1)),
place=place)
place=place,
optimizer_func=optimizer_func)
trainer.train(
reader=train_reader,
......
......@@ -57,14 +57,17 @@ def train_program():
return [avg_cost, acc]
def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
trainer = fluid.Trainer(
train_func=train_program,
place=place,
optimizer=optimizer,
optimizer_func=optimizer_func,
parallel=True)
def event_handler(event):
......
......@@ -44,12 +44,15 @@ def train_program():
return [avg_cost, acc]
def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer=optimizer)
train_func=train_program, place=place, optimizer_func=optimizer_func)
def event_handler(event):
if isinstance(event, fluid.EndEpochEvent):
......
......@@ -155,12 +155,15 @@ def train_program():
return [avg_cost, scale_infer]
def optimizer_func():
return fluid.optimizer.SGD(learning_rate=0.2)
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.SGD(learning_rate=0.2)
trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer=optimizer)
train_func=train_program, place=place, optimizer_func=optimizer_func)
feed_order = [
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
......
......@@ -64,15 +64,18 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def optimizer_func():
return fluid.optimizer.Adagrad(learning_rate=0.002)
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
word_dict = paddle.dataset.imdb.word_dict()
trainer = fluid.Trainer(
train_func=partial(train_program, word_dict),
place=place,
optimizer=optimizer)
optimizer_func=optimizer_func)
def event_handler(event):
if isinstance(event, fluid.EndEpochEvent):
......
......@@ -79,15 +79,18 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def optimizer_func():
return fluid.optimizer.Adagrad(learning_rate=0.002)
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
word_dict = paddle.dataset.imdb.word_dict()
trainer = fluid.Trainer(
train_func=partial(train_program, word_dict),
place=place,
optimizer=optimizer)
optimizer_func=optimizer_func)
def event_handler(event):
if isinstance(event, fluid.EndEpochEvent):
......
......@@ -71,15 +71,18 @@ def train_program(word_dict):
return [avg_cost, accuracy]
def optimizer_func():
return fluid.optimizer.Adagrad(learning_rate=0.002)
def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
word_dict = paddle.dataset.imdb.word_dict()
trainer = fluid.Trainer(
train_func=partial(train_program, word_dict),
place=place,
optimizer=optimizer)
optimizer_func=optimizer_func)
def event_handler(event):
if isinstance(event, fluid.EndEpochEvent):
......
......@@ -80,6 +80,10 @@ def train_program(is_sparse):
return avg_cost
def optimizer_func():
return fluid.optimizer.SGD(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
......@@ -104,9 +108,7 @@ def train(use_cuda, train_program, params_dirname):
sys.exit("got NaN loss, training failed.")
trainer = fluid.Trainer(
train_func=train_program,
optimizer=fluid.optimizer.SGD(learning_rate=0.001),
place=place)
train_func=train_program, optimizer_func=optimizer_func, place=place)
trainer.train(
reader=train_reader,
......
......@@ -80,21 +80,6 @@ def encoder_decoder():
return rnn()
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = core.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def main():
rnn_out = encoder_decoder()
label = layers.data(
......@@ -122,18 +107,21 @@ def main():
exe.run(framework.default_startup_program())
feed_order = [
'src_word_id', 'target_language_word', 'target_language_next_word'
]
feed_list = [
fluid.default_main_program().global_block().var(var_name)
for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)
batch_id = 0
for pass_id in xrange(10):
for data in train_data():
word_data = to_lodtensor(map(lambda x: x[0], data), place)
trg_word = to_lodtensor(map(lambda x: x[1], data), place)
trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
outs = exe.run(fluid.default_main_program(),
feed={
'src_word_id': word_data,
'target_language_word': trg_word,
'target_language_next_word': trg_word_next
},
feed=feeder.feed(data),
fetch_list=[avg_cost])
avg_cost_val = np.array(outs[0])
print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
......
......@@ -48,5 +48,7 @@ foreach(TEST_OP ${TEST_OPS})
endforeach(TEST_OP)
py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
# tests that need to be done in fixed timeout
set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
# FIXME(Yancey1989): this test costs much more time on CUDAPlace
# since it has to load the cuDNN libraries, so we use a longer timeout
# to keep this unit test stable.
set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 30)
......@@ -19,7 +19,8 @@ import math
from op_test import OpTest
def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
def box_coder(target_box, prior_box, prior_box_var, output_box, code_type,
box_normalized):
prior_box_x = (
(prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0])
prior_box_y = (
......@@ -30,6 +31,9 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
(prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0])
prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0],
prior_box_var.shape[1])
if not box_normalized:
prior_box_height = prior_box_height + 1
prior_box_width = prior_box_width + 1
if (code_type == "EncodeCenterSize"):
target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape(
......@@ -40,6 +44,9 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
target_box.shape[0], 1)
target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape(
target_box.shape[0], 1)
if not box_normalized:
target_box_height = target_box_height + 1
target_box_width = target_box_width + 1
output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \
prior_box_var[:,:,0]
......@@ -64,9 +71,13 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
output_box[:, :, 1] = target_box_y - target_box_height / 2
output_box[:, :, 2] = target_box_x + target_box_width / 2
output_box[:, :, 3] = target_box_y + target_box_height / 2
if not box_normalized:
output_box[:, :, 2] = output_box[:, :, 2] - 1
output_box[:, :, 3] = output_box[:, :, 3] - 1
def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type,
box_normalized):
n = target_box.shape[0]
m = prior_box.shape[0]
output_box = np.zeros((n, m, 4), dtype=np.float32)
......@@ -74,11 +85,11 @@ def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
if (code_type == "EncodeCenterSize"):
box_coder(target_box[lod[i]:lod[i + 1], :], prior_box,
prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
code_type)
code_type, box_normalized)
elif (code_type == "DecodeCenterSize"):
box_coder(target_box[lod[i]:lod[i + 1], :, :], prior_box,
prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
code_type)
code_type, box_normalized)
return output_box
......@@ -93,15 +104,19 @@ class TestBoxCoderOp(OpTest):
prior_box_var = np.random.random((10, 4)).astype('float32')
target_box = np.random.random((5, 10, 4)).astype('float32')
code_type = "DecodeCenterSize"
box_normalized = False
output_box = batch_box_coder(prior_box, prior_box_var, target_box,
lod[0], code_type)
lod[0], code_type, box_normalized)
self.inputs = {
'PriorBox': prior_box,
'PriorBoxVar': prior_box_var,
'TargetBox': target_box,
}
self.attrs = {'code_type': 'decode_center_size'}
self.attrs = {
'code_type': 'decode_center_size',
'box_normalized': False
}
self.outputs = {'OutputBox': output_box}
......@@ -116,15 +131,16 @@ class TestBoxCoderOpWithLoD(OpTest):
prior_box_var = np.random.random((10, 4)).astype('float32')
target_box = np.random.random((20, 4)).astype('float32')
code_type = "EncodeCenterSize"
box_normalized = True
output_box = batch_box_coder(prior_box, prior_box_var, target_box,
lod[0], code_type)
lod[0], code_type, box_normalized)
self.inputs = {
'PriorBox': prior_box,
'PriorBoxVar': prior_box_var,
'TargetBox': (target_box, lod),
}
self.attrs = {'code_type': 'encode_center_size'}
self.attrs = {'code_type': 'encode_center_size', 'box_normalized': True}
self.outputs = {'OutputBox': output_box}
......
......@@ -12,40 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers
from paddle.fluid.transpiler.distribute_transpiler import delete_ops
import numpy
from transpiler_test import TranspilerTest
class TestDistTranspiler(unittest.TestCase):
class TestDistTranspiler(TranspilerTest):
def setUp(self):
self.trainer_id = 0
self.trainers = 2
self.pservers = 2
self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
self.current_pserver_ep = "127.0.0.1:6174"
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
return optimize_ops, params_grads
def test_transpiler(self):
trainer = self.get_trainer()
pserver, startup = self.get_pserver(self.current_pserver_ep)
......@@ -70,14 +46,6 @@ class TestDistTranspiler(unittest.TestCase):
fc_w_var = startup.global_block().var("fc_w.block1")
self.assertEqual(fc_w_var.shape, (500, 1000))
def get_main_program(self):
main = fluid.Program()
with fluid.program_guard(main):
self.net_conf()
return main
def get_expect_trainer_ops(self):
trainer = fluid.Program()
......@@ -92,25 +60,6 @@ class TestDistTranspiler(unittest.TestCase):
ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars")
return ops
def get_trainer(self):
return self._transpiler_instance().get_trainer_program()
def get_pserver(self, ep):
t = self._transpiler_instance()
pserver = t.get_pserver_program(ep)
startup = t.get_startup_program(ep, pserver)
return pserver, startup
def _transpiler_instance(self):
main = self.get_main_program()
t = fluid.DistributeTranspiler()
t.transpile(
self.trainer_id,
program=main,
pservers=self.pserver_eps,
trainers=self.trainers)
return t
if __name__ == "__main__":
unittest.main()
......@@ -369,13 +369,21 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output)
print(str(program))
def test_upsampling_bilinear2d(self):
def test_resize_bilinear(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[3, 9, 6], dtype="float32")
output = layers.upsampling_bilinear2d(x, out_shape=[12, 12])
output = layers.resize_bilinear(x, out_shape=[12, 12])
self.assertIsNotNone(output)
output = layers.upsampling_bilinear2d(x, scale=3)
output = layers.resize_bilinear(x, scale=3)
self.assertIsNotNone(output)
print(str(program))
def test_polygon_box_transform(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[8, 4, 4], dtype="float32")
output = layers.polygon_box_transform(input=x)
self.assertIsNotNone(output)
print(str(program))
......
......@@ -23,7 +23,7 @@ from multiprocessing import Process
from op_test import OpTest
def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id):
def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
x = fluid.layers.data(name='x', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
......@@ -39,15 +39,8 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
port = os.getenv("PADDLE_INIT_PORT", port)
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS", ip) # ip,ip...
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS", trainer_count))
current_endpoint = os.getenv("POD_IP", ip) + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID", trainer_id))
pserver_endpoints = ip + ":" + port
current_endpoint = ip + ":" + port
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id,
......@@ -62,47 +55,51 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id):
class TestListenAndServOp(OpTest):
def setUp(self):
self.sleep_time = 5
self.ps_timeout = 5
self.ip = "127.0.0.1"
self.port = "6173"
self.trainer_count = 1
self.trainers = 1
self.trainer_id = 1
def _raise_signal(self, parent_pid, raised_signal):
time.sleep(self.sleep_time)
ps_command = subprocess.Popen(
"ps -o pid --ppid %d --noheaders" % parent_pid,
shell=True,
stdout=subprocess.PIPE)
ps_output = ps_command.stdout.read()
retcode = ps_command.wait()
assert retcode == 0, "ps command returned %d" % retcode
for pid_str in ps_output.split("\n")[:-1]:
try:
os.kill(int(pid_str), raised_signal)
except Exception:
continue
def _start_pserver(self, use_cuda, sync_mode):
p = Process(
target=run_pserver,
args=(use_cuda, sync_mode, self.ip, self.port, self.trainer_count,
args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
self.trainer_id))
p.start()
return p.pid
def _wait_ps_ready(self, pid):
retry_times = self.ps_timeout
while True:
assert retry_times >= 0, "wait ps ready failed"
time.sleep(0.5)
try:
# the listen_and_serv_op will touch a file containing the listen port
# in the /tmp directory once it is ready to process all the RPC calls.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error:
retry_times -= 1
def test_rpc_interfaces(self):
# TODO(Yancey1989): need to make sure the rpc interfaces work correctly.
pass
def test_handle_signal_in_serv_op(self):
# run pserver on CPU in sync mode
self._start_pserver(False, True)
pid = self._start_pserver(False, True)
self._wait_ps_ready(pid)
# raise SIGINT to pserver
self._raise_signal(os.getpid(), signal.SIGINT)
# raise SIGTERM to pserver
os.kill(pid, signal.SIGTERM)
# run pserver on CPU in async mode
self._start_pserver(False, False)
pid = self._start_pserver(False, False)
self._wait_ps_ready(pid)
# raise SIGTERM to pserver
self._raise_signal(os.getpid(), signal.SIGTERM)
os.kill(pid, signal.SIGTERM)
if __name__ == '__main__':
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.transpiler.distribute_transpiler import delete_ops
from transpiler_test import TranspilerTest
class TestSimpleDistTranspiler(TranspilerTest):
def setUp(self):
self.current_pserver_ep = "127.0.0.1:6175"
def test_simple_transpiler(self):
np.random.seed(1)
trainer = self.get_trainer()
pserver, startup = self.get_pserver(self.current_pserver_ep)
self.assertEqual([op.type for op in trainer.global_block().ops],
self.get_expect_trainer_ops())
self.assertEqual(len(pserver.blocks), 2)
# block0: listen_and_serv
self.assertEqual([op.type for op in pserver.blocks[0].ops],
["listen_and_serv"])
# block1: optimize pass
self.assertEqual([op.type for op in pserver.blocks[1].ops],
["sum", "scale", "sgd"])
# confirm startup program
self.assertEqual([op.type for op in startup.global_block().ops],
["fill_constant", "uniform_random", "uniform_random"])
# the variable fc_w will NOT be split
fc_w_var = startup.global_block().var("fc_w@GRAD")
self.assertEqual(fc_w_var.shape, (1000, 1000))
fc_w_var = startup.global_block().var("fc_w@GRAD.trainer_0")
self.assertEqual(fc_w_var.shape, (1000, 1000))
def get_expect_trainer_ops(self):
trainer = fluid.Program()
with fluid.program_guard(trainer):
optimize_ops, params_grads = self.net_conf()
delete_ops(trainer.global_block(), optimize_ops)
ops = [op.type for op in trainer.global_block().ops] + [
"send_vars", "send_barrier", "recv", "recv", "fetch_barrier"
]
ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars")
return ops
def _transpiler_instance(self):
main = self.get_main_program()
t = fluid.DistributeTranspiler()
t.transpile(
self.trainer_id,
program=main,
pservers=self.pserver_eps,
trainers=self.trainers,
slice_var_up=False)
return t
if __name__ == "__main__":
unittest.main()
......@@ -14,14 +14,14 @@
import math
import unittest
from paddle.fluid.transpiler.distribute_transpiler import split_variable
from paddle.fluid.transpiler.distribute_transpiler import slice_variable
import paddle.fluid as fluid
import paddle.fluid.core as core
import random
class TestSplitVar(unittest.TestCase):
def check_split_output(self, shapes, expected_sizes, min_size):
class TestSliceVar(unittest.TestCase):
def check_slice_output(self, shapes, expected_sizes, min_size):
var_list = []
program = fluid.Program()
for shape in shapes:
......@@ -31,7 +31,7 @@ class TestSplitVar(unittest.TestCase):
# dtype=core.VarDesc.VarType.LOD_TENSOR,
shape=shape)
var_list.append(var)
blocks = split_variable(var_list, 10, min_size)
blocks = slice_variable(var_list, 10, min_size)
all_sizes = []
for s in expected_sizes:
for s2 in s:
......@@ -49,7 +49,7 @@ class TestSplitVar(unittest.TestCase):
[1150, 1150, 1150, 1150, 1150, 1150, 1100]
]
self.check_split_output(shapes, expected_sizes, 1024)
self.check_slice_output(shapes, expected_sizes, 1024)
def test_check_output_8k(self):
shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10],
......@@ -57,7 +57,7 @@ class TestSplitVar(unittest.TestCase):
expected_sizes = [[15], [1024], [10976, 10976], [8160], [8000],
[35937, 35937, 35937, 35937, 35937, 35937]]
self.check_split_output(shapes, expected_sizes, 8192)
self.check_slice_output(shapes, expected_sizes, 8192)
if __name__ == '__main__':
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers
class TranspilerTest(unittest.TestCase):
@classmethod
def setUpClass(self):
self.trainer_id = 0
self.trainers = 2
self.pservers = 2
self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
return optimize_ops, params_grads
def get_main_program(self):
main = fluid.Program()
with fluid.program_guard(main):
self.net_conf()
return main
def get_trainer(self):
return self._transpiler_instance().get_trainer_program()
def get_pserver(self, ep):
t = self._transpiler_instance()
pserver = t.get_pserver_program(ep)
startup = t.get_startup_program(ep, pserver)
return pserver, startup
def _transpiler_instance(self):
main = self.get_main_program()
t = fluid.DistributeTranspiler()
t.transpile(
self.trainer_id,
program=main,
pservers=self.pserver_eps,
trainers=self.trainers)
return t
......@@ -90,13 +90,13 @@ class Trainer(object):
Args:
train_func(callable): A function which will return loss. The loss must be a scalar.
optimizer(optimizer.Optimizer): The optimizer should be an instance of Optimizer
optimizer_func(callable): A function that returns an Optimizer object.
place: The device place of this trainer.
"""
def __init__(self,
train_func,
optimizer,
optimizer_func,
param_path=None,
place=None,
parallel=False):
......@@ -105,8 +105,6 @@ class Trainer(object):
# 1. we need to generate a framework.Program by calling
# program_func. Reference: fluid.program_guard in
# test_word2vec.py
if not isinstance(optimizer, opt_module.Optimizer):
raise TypeError("The optimizer should be an instance of Optimizer")
self.scope = core.Scope()
......@@ -118,11 +116,14 @@ class Trainer(object):
self.train_func_outputs = program_func_outs if isinstance(
program_func_outs, list) else [program_func_outs]
self.test_program = self.train_program.clone()
# The first element of program_func_outs is loss.
loss = self.train_func_outputs[0]
optimizer = optimizer_func()
if not isinstance(optimizer, opt_module.Optimizer):
raise TypeError(
"The optimizer should be an instance of Optimizer")
# The first element of program_func_outs is loss.
loss = self.train_func_outputs[0]
optimize_ops, params_grads = optimizer.minimize(loss)
self.place = check_and_get_place(place)
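With this change the Trainer expects a callable that constructs the optimizer rather than a ready-made optimizer instance, so the optimizer is created only after the train program has been built. A rough usage sketch, assuming the class above is exposed as fluid.Trainer and using a hypothetical linear-regression train function:

import paddle.fluid as fluid

def build_train_net():
    # hypothetical network: linear regression returning a scalar loss
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)
    return fluid.layers.mean(
        fluid.layers.square_error_cost(input=y_predict, label=y))

def optimizer_func():
    # invoked inside Trainer.__init__, see the isinstance check above
    return fluid.optimizer.SGD(learning_rate=0.01)

trainer = fluid.Trainer(
    train_func=build_train_net,
    optimizer_func=optimizer_func,
    place=fluid.CPUPlace())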
......
......@@ -15,10 +15,9 @@
from distribute_transpiler import DistributeTranspiler
from inference_transpiler import InferenceTranspiler
from memory_optimization_transpiler import memory_optimize, release_memory
from distribute_transpiler_simple import SimpleDistributeTranspiler
from ps_dispatcher import HashName, RoundRobin
__all__ = [
"DistributeTranspiler", "InferenceTranspiler", "SimpleDistributeTranspiler",
"memory_optimize", "release_memory", "HashName", "RoundRobin"
"DistributeTranspiler", "InferenceTranspiler", "memory_optimize",
"release_memory", "HashName", "RoundRobin"
]
......@@ -39,6 +39,7 @@ Steps to transpile pserver:
from __future__ import print_function
import math
import numpy as np
from ps_dispatcher import RoundRobin, HashName, PSDispatcher
from .. import core, framework
......@@ -70,7 +71,7 @@ def same_or_split_var(p_name, var_name):
return p_name == var_name or p_name.startswith(var_name + ".block")
def split_variable(var_list, service_count, min_block_size=8192):
def slice_variable(var_list, slice_count, min_block_size=8192):
"""
We may need to split a dense tensor into one or more blocks and put
them evenly onto the parameter servers. One block is a sub-tensor
......@@ -78,25 +79,25 @@ def split_variable(var_list, service_count, min_block_size=8192):
We need to have a minimal block size so that the calculations in
the parameter server side can gain better performance. By default
minimum block size 8K elements (maybe 16bit or 32bit or 64bit).
minimum block size 8K elements (maybe 16bit or 32bit or 64bit).
Args:
var_list (list): List of variables.
service_count (int): Numel of pserver services. A pserver may have two
or more listening ports.
slice_count (int): Number of slices the variables will be split into, which
is typically the number of pserver services.
min_block_size (int): Minimum size of each split block.
Returns:
blocks (list[(varname, block_id, current_block_size)]): A list
blocks (list[(varname, block_id, current_block_size)]): A list
of VarBlocks. Each VarBlock specifies a shard of the var.
"""
blocks = []
for var in var_list:
split_count = service_count
split_count = slice_count
var_numel = reduce(lambda x, y: x * y, var.shape)
max_pserver_count = int(math.floor(var_numel / float(min_block_size)))
if max_pserver_count == 0:
max_pserver_count = 1
if max_pserver_count < service_count:
if max_pserver_count < slice_count:
split_count = max_pserver_count
block_size = int(math.ceil(var_numel / float(split_count)))
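The arithmetic above can be checked by hand. The sketch below mirrors only the block-count and block-size computation visible in this hunk (the full function may additionally align block boundaries), using hypothetical shapes.

# Stand-alone sketch of the slicing arithmetic above (hypothetical shapes,
# Python 2 builtins as in the surrounding file).
import math

def sketch_slice(shape, slice_count, min_block_size=8192):
    var_numel = reduce(lambda x, y: x * y, shape)
    max_pserver_count = int(math.floor(var_numel / float(min_block_size)))
    if max_pserver_count == 0:
        max_pserver_count = 1
    # equivalent to the if/else above: never use more slices than blocks fit
    split_count = min(slice_count, max_pserver_count)
    block_size = int(math.ceil(var_numel / float(split_count)))
    return split_count, block_size

print(sketch_slice([800, 10], 4))      # (1, 8000): below 8K, kept as one block
print(sketch_slice([1024, 1024], 4))   # (4, 262144): evenly sliced across 4 pservers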
......@@ -177,7 +178,7 @@ class DistributeTranspiler:
for index in range(len(self.pserver_endpoints))
]
def _init_splited_vars(self, split_method):
def _init_splited_vars(self, slice_var_up):
# update these mappings for further transpile:
# 1. param_var_mapping: param var name -> [splited params vars]
# 2. grad_var_mapping: grad var name -> [splited grads vars]
......@@ -186,19 +187,34 @@ class DistributeTranspiler:
param_list = []
grad_list = []
param_grad_set = set()
for p, g in self.params_grads:
# skip parameter marked not trainable
if type(p) == Parameter and p.trainable == False:
continue
param_list.append(p)
grad_list.append(g)
if p.name not in param_grad_set:
param_list.append(p)
param_grad_set.add(p.name)
if g.name not in param_grad_set:
grad_list.append(g)
param_grad_set.add(g.name)
self._update_dist_lookup_table_vars(param_list, grad_list,
self.params_grads)
grad_blocks = split_variable(grad_list, len(self.pserver_endpoints))
param_blocks = split_variable(param_list, len(self.pserver_endpoints))
if slice_var_up:
# when we slice var up into blocks, we will slice the var according to
# pserver services' count. A pserver may have two or more listening ports.
grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints))
param_blocks = slice_variable(param_list,
len(self.pserver_endpoints))
else:
# when we do NOT slice vars up into blocks, each param/grad is kept
# as a single block.
grad_blocks = slice_variable(grad_list, 1)
param_blocks = slice_variable(param_list, 1)
assert (len(grad_blocks) == len(param_blocks))
# origin_varname -> [splited_var]
self.param_var_mapping = self._create_vars_from_blocklist(
self.origin_program, param_blocks)
......@@ -229,6 +245,7 @@ class DistributeTranspiler:
program=None,
pservers="127.0.0.1:6174",
trainers=1,
slice_var_up=True,
split_method=RoundRobin,
sync_mode=True):
"""
......@@ -262,13 +279,27 @@ class DistributeTranspiler:
self.has_distributed_lookup_table = self._has_distributed_lookup_table()
# split and create vars, then put splited vars in dicts for later use.
self._init_splited_vars(split_method)
self._init_splited_vars(slice_var_up)
# step 3.1: insert send op to send gradient vars to parameter servers
ps_dispatcher.reset()
send_vars = []
for orig_varname, splited_vars in self.grad_var_mapping.items():
# in general cases, the number of pservers is a multiple of 2, and this
# will lead to an uneven distribution among weights and biases:
# fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1
# fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
# shuffling the map avoids the uneven distribution above
grad_var_mapping_items = self.grad_var_mapping.items()
if not slice_var_up:
np.random.shuffle(grad_var_mapping_items)
for orig_varname, splited_vars in grad_var_mapping_items:
eplist = ps_dispatcher.dispatch(splited_vars)
if not slice_var_up:
assert (len(splited_vars) == 1)
if len(splited_vars) == 1:
orig_varname = splited_vars[0].name
index = find_op_by_output_arg(program.global_block(),
......@@ -316,6 +347,7 @@ class DistributeTranspiler:
for i, ep in enumerate(eplist):
self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i])
self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])
# step4: Concat the parameters splits together after recv.
for varname, splited_var in self.param_var_mapping.iteritems():
eps = []
......@@ -788,8 +820,8 @@ class DistributeTranspiler:
program (ProgramDesc): ProgramDesc to which the gradients belong.
block_list (list[(varname, block_id, block_size)]): List of gradient blocks.
add_trainer_suffix (Bool): Add trainer suffix to new variable's name if set True.
Returns:
var_mapping (dict(varname->[new_varname_variable])):A dict mapping
Returns:
var_mapping (dict(varname->[new_varname_variable])):A dict mapping
from original var name to each var split.
"""
......@@ -802,6 +834,9 @@ class DistributeTranspiler:
if not block_map.has_key(varname):
block_map[varname] = []
block_map[varname].append((long(offset), long(size)))
# Do not remove this important debug message:
print("block map: %s" % block_map)
for varname, splited in block_map.iteritems():
orig_var = program.global_block().var(varname)
if len(splited) == 1:
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ..framework import Program, default_main_program, Parameter, Variable
from ..layer_helper import LayerHelper
def hash_name_to_server(params_grads, pserver_endpoints):
"""
:param param_grads:
:return: a map of pserver endpoint ->
params -> [param list]
grads -> [grad list]
"""
def _hash_param(param_name, total):
return hash(param_name) % total
param_grad_map = dict()
for param, grad in params_grads:
if param.trainable is True and grad is not None:
server_id = _hash_param(param.name, len(pserver_endpoints))
server_for_param = pserver_endpoints[server_id]
if not param_grad_map.has_key(server_for_param):
param_grad_map[server_for_param] = {"params": [], "grads": []}
param_grad_map[server_for_param]["params"].append(param)
param_grad_map[server_for_param]["grads"].append(grad)
return param_grad_map
def round_robin(params_grads, pserver_endpoints):
assert (len(params_grads) > len(pserver_endpoints))
param_grad_map = dict()
pserver_idx = 0
for param, grad in params_grads:
if param.trainable is True:
server_for_param = pserver_endpoints[pserver_idx]
if not param_grad_map.has_key(server_for_param):
param_grad_map[server_for_param] = {"params": [], "grads": []}
param_grad_map[server_for_param]["params"].append(param)
param_grad_map[server_for_param]["grads"].append(grad)
pserver_idx += 1
if pserver_idx >= len(pserver_endpoints):
pserver_idx = 0
return param_grad_map
class SimpleDistributeTranspiler:
def transpile(self,
optimize_ops,
params_grads,
program=None,
pservers="127.0.0.1:6174",
trainers=1,
split_method=round_robin):
"""
Transpile the program to a distributed data-parallelism programs.
The main_program will be transformed to use a remote parameter server
to do parameter optimization, and the optimization graph will be put
into a parameter server program.
Use different methods to split trainable variables across different
parameter servers.
Example to run:
exe = fluid.Executor(place)
t = fluid.DistributeTranspiler()
t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)
pserver_endpoint = os.getenv("PSERVER")
if pserver_endpoint:
pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops)
exe.run(fluid.default_startup_program())
exe.run(pserver_prog)
else:
feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM):
...
:param optimize_ops: op list of optimization, should be the
return value of Optimizer.minimize
:type optimize_ops: list
:param program: program to optimize, default default_main_program
:param pservers: parameter server endpoints like "m1:6174,m2:6174"
:type pservers: string
:return: return a list of programs
"""
if program is None:
program = default_main_program()
self.program = program
self.trainers = trainers
self.optimize_ops = optimize_ops
self._optimize_distributed(
optimize_ops,
program,
params_grads,
pservers=pservers,
trainers=trainers,
split_method=split_method)
def _clone_param(self, block, v):
assert isinstance(v, Parameter)
new_p = Parameter(
block=block,
shape=v.shape,
dtype=v.dtype,
type=v.type,
lod_level=v.lod_level,
stop_gradient=v.stop_gradient,
trainable=v.trainable,
optimize_attr=v.optimize_attr,
regularizer=v.regularizer,
name=v.name)
block.vars[new_p.name] = new_p
def _clone_var(self, block, var):
assert isinstance(var, Variable)
return block.create_var(
name=var.name,
shape=var.shape,
dtype=var.dtype,
type=var.type,
lod_level=var.lod_level,
persistable=var.persistable)
def _optimize_distributed(self, optimize_ops, program, params_and_grads,
**kwargs):
if kwargs.has_key("split_method"):
split_method = kwargs["split_method"]
else:
split_method = round_robin
assert (callable(split_method))
pserver_endpoints = kwargs["pservers"].split(",")
self.param_grad_map = split_method(params_and_grads, pserver_endpoints)
send_op_ordered_inputs = []
send_op_ordered_outputs = []
epmap = []
for ep, v in self.param_grad_map.iteritems():
send_op_ordered_inputs.extend(v["grads"])
send_op_ordered_outputs.extend(v["params"])
for i in v["grads"]:
epmap.append(ep)
send_op = program.global_block().append_op(
type="send",
inputs={"X": send_op_ordered_inputs
}, # inputs is a list of tensors to be sent
outputs={"Out": send_op_ordered_outputs},
attrs={"endpoints": pserver_endpoints,
"epmap": epmap})
def get_trainer_program(self):
# remove optimize ops and add a send op to main_program
self.program.global_block().delete_ops(self.optimize_ops)
return self.program
def _create_var_for_trainers(self, block, var, trainers):
var_list = []
for i in xrange(trainers):
var_each = block.create_var(
name="%s.trainer_%d" % (var.name, i),
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
var_list.append(var_each)
return var_list
def get_pserver_program(self, endpoint, optimize_ops):
pserver_program = Program()
for v in self.param_grad_map[endpoint]["params"]:
self._clone_param(pserver_program.global_block(), v)
optimize_sub_program = Program()
grad_var_names = [
var.name for var in self.param_grad_map[endpoint]["grads"]
]
for opt_op in optimize_ops:
for _, var in opt_op.inputs.iteritems():
# NOTE: append operators to merge gradients from multiple
# trainers. If trainers == 1, this is not needed.
if self.trainers > 1 and var.name in grad_var_names:
vars2merge = self._create_var_for_trainers(
optimize_sub_program.global_block(), var, self.trainers)
merged_var = optimize_sub_program.global_block().create_var(
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
optimize_sub_program.global_block().append_op(
type="sum",
inputs={"X": vars2merge},
outputs={"Out": merged_var})
optimize_sub_program.global_block().append_op(
type="scale",
inputs={"X": merged_var},
outputs={"Out": merged_var},
attrs={"scale": 1.0 / float(self.trainers)})
else:
optimize_sub_program.global_block().create_var(
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
if opt_op.inputs.has_key("Grad"):
if opt_op.inputs["Grad"].name in grad_var_names:
optimize_sub_program.global_block().append_op(
type=opt_op.type,
inputs=opt_op.inputs,
outputs=opt_op.outputs,
attrs=opt_op.attrs)
else:
optimize_sub_program.global_block().append_op(
type=opt_op.type,
inputs=opt_op.inputs,
outputs=opt_op.outputs,
attrs=opt_op.attrs)
pserver_program.global_block().append_op(
type="recv",
inputs={"RX":
self.param_grad_map[endpoint]["grads"]}, # grads to recv
outputs={},
attrs={
"OptimizeBlock": optimize_sub_program.global_block(),
"endpoint": endpoint,
"ParamList":
[p.name for p in self.param_grad_map[endpoint]["params"]],
"GradList":
[p.name for p in self.param_grad_map[endpoint]["grads"]],
"Trainers": self.trainers
})
pserver_program.sync_with_cpp()
return pserver_program
......@@ -15,7 +15,7 @@
__all__ = ['batch']
def batch(reader, batch_size):
def batch(reader, batch_size, drop_last=False):
"""
Create a batched reader.
......@@ -23,6 +23,8 @@ def batch(reader, batch_size):
:type reader: callable
:param batch_size: size of each mini-batch
:type batch_size: int
:param drop_last: drop the last batch if its size is not equal to batch_size.
:type drop_last: bool
:return: the batched reader.
:rtype: callable
"""
......@@ -35,7 +37,7 @@ def batch(reader, batch_size):
if len(b) == batch_size:
yield b
b = []
if b:
if drop_last == False and len(b) != 0:
yield b
return batch_reader
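A quick way to see what drop_last changes, using a hypothetical ten-item reader with batch_size=3:

# Hypothetical reader: 10 items, so batch_size=3 leaves a short final batch.
def ten_items():
    for i in range(10):
        yield i

kept = list(batch(ten_items, 3)())                    # default: keep the tail
dropped = list(batch(ten_items, 3, drop_last=True)())
print(len(kept))      # 4 -> [0,1,2] [3,4,5] [6,7,8] [9]
print(len(dropped))   # 3 -> the trailing short batch is discarded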